From c2cad0997bb68d5963f912d23f6aa747d0d0f52e Mon Sep 17 00:00:00 2001 From: Nuzhny007 Date: Wed, 2 Oct 2024 08:08:47 +0300 Subject: [PATCH 1/3] First version with cuda 12.6, trt 10 --- src/Detector/tensorrt_yolo/CMakeLists.txt | 14 +- src/Detector/tensorrt_yolo/YoloONNX.cpp | 29 +- .../tensorrt_yolo/common/BatchStream.h | 47 +- .../tensorrt_yolo/common/EntropyCalibrator.h | 18 +- .../tensorrt_yolo/common/ErrorRecorder.h | 9 +- .../tensorrt_yolo/common/argsParser.h | 162 + .../tensorrt_yolo/common/bfloat16.cpp | 60 + src/Detector/tensorrt_yolo/common/bfloat16.h | 46 + src/Detector/tensorrt_yolo/common/buffers.h | 164 +- src/Detector/tensorrt_yolo/common/common.h | 330 +- .../tensorrt_yolo/common/dumpTFWts.py | 124 + .../tensorrt_yolo/common/fileLock.cpp | 100 + src/Detector/tensorrt_yolo/common/fileLock.h | 86 + .../tensorrt_yolo/common/getOptions.cpp | 248 + .../tensorrt_yolo/common/getOptions.h | 128 + src/Detector/tensorrt_yolo/common/getopt.c | 568 +++ src/Detector/tensorrt_yolo/common/getoptWin.h | 124 + src/Detector/tensorrt_yolo/common/half.h | 9 +- src/Detector/tensorrt_yolo/common/logger.cpp | 7 +- src/Detector/tensorrt_yolo/common/logger.h | 5 +- src/Detector/tensorrt_yolo/common/logging.h | 16 +- .../tensorrt_yolo/common/parserOnnxConfig.h | 56 +- .../tensorrt_yolo/common/safeCommon.h | 321 +- .../tensorrt_yolo/common/sampleConfig.h | 50 +- .../tensorrt_yolo/common/sampleDevice.cpp | 133 + .../tensorrt_yolo/common/sampleDevice.h | 142 +- .../tensorrt_yolo/common/sampleEngines.cpp_ | 1688 +++++++ .../tensorrt_yolo/common/sampleEngines.h | 296 +- .../tensorrt_yolo/common/sampleEntrypoints.h | 101 + .../tensorrt_yolo/common/sampleInference.cpp_ | 1622 +++++++ .../tensorrt_yolo/common/sampleInference.h | 226 +- .../tensorrt_yolo/common/sampleOptions.cpp | 2081 ++++++-- .../tensorrt_yolo/common/sampleOptions.h | 236 +- .../tensorrt_yolo/common/sampleReporting.cpp | 300 +- .../tensorrt_yolo/common/sampleReporting.h | 124 +- .../tensorrt_yolo/common/sampleUtils.cpp | 587 +++ .../tensorrt_yolo/common/sampleUtils.h | 528 +- .../tensorrt_yolo/common/streamReader.h | 78 + .../tensorrt_yolo/common/timingCache.cpp | 157 + .../tensorrt_yolo/common/timingCache.h | 38 + .../common_deprecated/BatchStream.h | 388 ++ .../common_deprecated/EntropyCalibrator.h | 134 + .../common_deprecated/ErrorRecorder.h | 137 + .../tensorrt_yolo/common_deprecated/buffers.h | 478 ++ .../tensorrt_yolo/common_deprecated/common.h | 963 ++++ .../tensorrt_yolo/common_deprecated/half.h | 4302 +++++++++++++++++ .../common_deprecated/logger.cpp | 40 + .../tensorrt_yolo/common_deprecated/logger.h | 36 + .../tensorrt_yolo/common_deprecated/logging.h | 578 +++ .../common_deprecated/parserOnnxConfig.h | 153 + .../common_deprecated/safeCommon.h | 71 + .../common_deprecated/sampleConfig.h | 337 ++ .../common_deprecated/sampleDevice.h | 494 ++ .../sampleEngines.cpp | 0 .../common_deprecated/sampleEngines.h | 183 + .../sampleInference.cpp | 0 .../common_deprecated/sampleInference.h | 92 + .../common_deprecated/sampleOptions.cpp | 1778 +++++++ .../common_deprecated/sampleOptions.h | 355 ++ .../common_deprecated/sampleReporting.cpp | 445 ++ .../common_deprecated/sampleReporting.h | 222 + .../common_deprecated/sampleUtils.h | 543 +++ src/Detector/tensorrt_yolo/yolo.cpp | 63 +- src/Detector/tensorrt_yolo/yolo.h | 1 + 64 files changed, 21126 insertions(+), 1725 deletions(-) create mode 100644 src/Detector/tensorrt_yolo/common/argsParser.h create mode 100644 src/Detector/tensorrt_yolo/common/bfloat16.cpp create mode 100644 
src/Detector/tensorrt_yolo/common/bfloat16.h create mode 100644 src/Detector/tensorrt_yolo/common/dumpTFWts.py create mode 100644 src/Detector/tensorrt_yolo/common/fileLock.cpp create mode 100644 src/Detector/tensorrt_yolo/common/fileLock.h create mode 100644 src/Detector/tensorrt_yolo/common/getOptions.cpp create mode 100644 src/Detector/tensorrt_yolo/common/getOptions.h create mode 100644 src/Detector/tensorrt_yolo/common/getopt.c create mode 100644 src/Detector/tensorrt_yolo/common/getoptWin.h create mode 100644 src/Detector/tensorrt_yolo/common/sampleDevice.cpp create mode 100644 src/Detector/tensorrt_yolo/common/sampleEngines.cpp_ create mode 100644 src/Detector/tensorrt_yolo/common/sampleEntrypoints.h create mode 100644 src/Detector/tensorrt_yolo/common/sampleInference.cpp_ create mode 100644 src/Detector/tensorrt_yolo/common/sampleUtils.cpp create mode 100644 src/Detector/tensorrt_yolo/common/streamReader.h create mode 100644 src/Detector/tensorrt_yolo/common/timingCache.cpp create mode 100644 src/Detector/tensorrt_yolo/common/timingCache.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/BatchStream.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/EntropyCalibrator.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/ErrorRecorder.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/buffers.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/common.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/half.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/logger.cpp create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/logger.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/logging.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/parserOnnxConfig.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/safeCommon.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleConfig.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleDevice.h rename src/Detector/tensorrt_yolo/{common => common_deprecated}/sampleEngines.cpp (100%) create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.h rename src/Detector/tensorrt_yolo/{common => common_deprecated}/sampleInference.cpp (100%) create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleInference.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.cpp create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.cpp create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.h create mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleUtils.h diff --git a/src/Detector/tensorrt_yolo/CMakeLists.txt b/src/Detector/tensorrt_yolo/CMakeLists.txt index 30509d0e..30f916bf 100644 --- a/src/Detector/tensorrt_yolo/CMakeLists.txt +++ b/src/Detector/tensorrt_yolo/CMakeLists.txt @@ -58,13 +58,20 @@ file(GLOB TENSORRT_CUDA_FILES *.cu) cuda_add_library(${libname_rt} SHARED ${TENSORRT_CUDA_FILES} ${TENSORRT_SOURCE_FILES} - ${TENSORRT_HEADER_FILES} -) + ${TENSORRT_HEADER_FILES}) #message("TensorRT OpenCV libraries:") #message("${OpenCV_LIBS}") #message(${OpenCV_DIR}) +if (MSVC) + file(GLOB TensorRT_LIBRARIES ${TensorRT_LIBRARY}) +endif() + +message("TensorRT_LIBRARY: ${TensorRT_LIBRARY}") +message("TensorRT_LIBRARIES: ${TensorRT_LIBRARIES}") + + set(TENSORRT_LIBS ${OpenCV_LIBS} 
#${CUDA_LIBRARIES} @@ -74,8 +81,7 @@ set(TENSORRT_LIBS ${CUDA_curand_LIBRARY} ${CUDNN_LIBRARY} # ${LIB_PTHREAD} - ${TensorRT_LIBRARIES} -) + ${TensorRT_LIBRARIES}) if (CMAKE_COMPILER_IS_GNUCXX) set(TENSORRT_LIBS ${TENSORRT_LIBS} stdc++fs nvinfer_plugin nvonnxparser) diff --git a/src/Detector/tensorrt_yolo/YoloONNX.cpp b/src/Detector/tensorrt_yolo/YoloONNX.cpp index b016c4b3..0b19d5cc 100644 --- a/src/Detector/tensorrt_yolo/YoloONNX.cpp +++ b/src/Detector/tensorrt_yolo/YoloONNX.cpp @@ -22,14 +22,13 @@ bool YoloONNX::Init(const SampleYoloParams& params) auto GetBindings = [&]() { - auto numBindings = m_engine->getNbBindings(); + auto numBindings = m_engine->getNbIOTensors(); std::cout << "** Bindings: " << numBindings << " **" << std::endl; for (int32_t i = 0; i < numBindings; ++i) { - nvinfer1::Dims dim = m_engine->getBindingDimensions(i); - - std::string bindName = m_engine->getBindingName(i); + std::string bindName = m_engine->getIOTensorName(i); + nvinfer1::Dims dim = m_engine->getTensorShape(bindName.c_str()); for (const auto& outName : m_params.outputTensorNames) { if (bindName == outName) @@ -77,27 +76,17 @@ bool YoloONNX::Init(const SampleYoloParams& params) delete infer; #endif - sample::gLogInfo << "TRT Engine loaded from: " << m_params.engineFileName << std::endl; - - GetBindings(); - - if (!m_engine) + if (m_engine) { - res = false; + GetBindings(); + m_inputDims = m_engine->getTensorShape(m_engine->getIOTensorName(0)); + res = true; } else { -#if 1 - m_inputDims = m_engine->getBindingDimensions(0); -#else - m_inputDims.nbDims = 4; - m_inputDims.d[0] = m_params.explicitBatchSize; - m_inputDims.d[1] = 3; - m_inputDims.d[2] = m_params.width; - m_inputDims.d[3] = m_params.height; -#endif res = true; } + sample::gLogInfo << "TRT Engine loaded from: " << m_params.engineFileName << " with res = " << res << std::endl; } else { @@ -177,7 +166,7 @@ bool YoloONNX::ConstructNetwork(YoloONNXUniquePtr& builder, size_t dlaGlobalDRAMSize = config->getMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_GLOBAL_DRAM); std::cout << "workspaceSize = " << workspaceSize << ", dlaManagedSRAMSize = " << dlaManagedSRAMSize << ", dlaLocalDRAMSize = " << dlaLocalDRAMSize << ", dlaGlobalDRAMSize = " << dlaGlobalDRAMSize << std::endl; - config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, m_params.videoMemory ? m_params.videoMemory : 4096_MiB); + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, m_params.videoMemory ? m_params.videoMemory : (1 << 20)); #endif config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); diff --git a/src/Detector/tensorrt_yolo/common/BatchStream.h b/src/Detector/tensorrt_yolo/common/BatchStream.h index a8da9923..c4ab9de0 100644 --- a/src/Detector/tensorrt_yolo/common/BatchStream.h +++ b/src/Detector/tensorrt_yolo/common/BatchStream.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -119,7 +120,7 @@ class MNISTBatchStream : public IBatchStream file.read(reinterpret_cast(rawData.data()), numElements * sizeof(uint8_t)); mData.resize(numElements); std::transform( - rawData.begin(), rawData.end(), mData.begin(), [](uint8_t val) { return static_cast(val) / 255.f; }); + rawData.begin(), rawData.end(), mData.begin(), [](uint8_t val) { return static_cast(val) / 255.F; }); } void readLabelsFile(const std::string& labelsFilePath) @@ -152,42 +153,39 @@ class MNISTBatchStream : public IBatchStream class BatchStream : public IBatchStream { public: - BatchStream( - int batchSize, int maxBatches, std::string prefix, std::string suffix, std::vector directories) + BatchStream(int batchSize, int maxBatches, std::string const& prefix, std::string const& suffix, + std::vector const& directories) : mBatchSize(batchSize) , mMaxBatches(maxBatches) , mPrefix(prefix) , mSuffix(suffix) , mDataDir(directories) { - FILE* file = fopen(locateFile(mPrefix + std::string("0") + mSuffix, mDataDir).c_str(), "rb"); - ASSERT(file != nullptr); + std::ifstream file(locateFile(mPrefix + std::string("0") + mSuffix, mDataDir).c_str(), std::ios::binary); + ASSERT(file.good()); int d[4]; - size_t readSize = fread(d, sizeof(int), 4, file); - ASSERT(readSize == 4); + file.read(reinterpret_cast(d), 4 * sizeof(int32_t)); mDims.nbDims = 4; // The number of dimensions. mDims.d[0] = d[0]; // Batch Size mDims.d[1] = d[1]; // Channels mDims.d[2] = d[2]; // Height mDims.d[3] = d[3]; // Width ASSERT(mDims.d[0] > 0 && mDims.d[1] > 0 && mDims.d[2] > 0 && mDims.d[3] > 0); - fclose(file); mImageSize = mDims.d[1] * mDims.d[2] * mDims.d[3]; mBatch.resize(mBatchSize * mImageSize, 0); mLabels.resize(mBatchSize, 0); mFileBatch.resize(mDims.d[0] * mImageSize, 0); mFileLabels.resize(mDims.d[0], 0); - reset(0); } - BatchStream(int batchSize, int maxBatches, std::string prefix, std::vector directories) + BatchStream(int batchSize, int maxBatches, std::string const& prefix, std::vector const& directories) : BatchStream(batchSize, maxBatches, prefix, ".batch", directories) { } - BatchStream( - int batchSize, int maxBatches, nvinfer1::Dims dims, std::string listFile, std::vector directories) + BatchStream(int batchSize, int maxBatches, nvinfer1::Dims const& dims, std::string const& listFile, + std::vector const& directories) : mBatchSize(batchSize) , mMaxBatches(maxBatches) , mDims(dims) @@ -199,7 +197,6 @@ class BatchStream : public IBatchStream mLabels.resize(mBatchSize, 0); mFileBatch.resize(mDims.d[0] * mImageSize, 0); mFileLabels.resize(mDims.d[0], 0); - reset(0); } // Resets data members @@ -219,7 +216,7 @@ class BatchStream : public IBatchStream return false; } - for (int csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize) + for (int64_t csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize) { ASSERT(mFileBatchPos > 0 && mFileBatchPos <= mDims.d[0]); if (mFileBatchPos == mDims.d[0] && !update()) @@ -228,7 +225,7 @@ class BatchStream : public IBatchStream } // copy the smaller of: elements left to fulfill the request, or elements left in the file buffer. 
- csize = std::min(mBatchSize - batchPos, mDims.d[0] - mFileBatchPos); + csize = std::min(mBatchSize - batchPos, mDims.d[0] - mFileBatchPos); std::copy_n( getFileBatch() + mFileBatchPos * mImageSize, csize * mImageSize, getBatch() + batchPos * mImageSize); std::copy_n(getFileLabels() + mFileBatchPos, csize, getLabels() + batchPos); @@ -295,22 +292,16 @@ class BatchStream : public IBatchStream if (mListFile.empty()) { std::string inputFileName = locateFile(mPrefix + std::to_string(mFileCount++) + mSuffix, mDataDir); - FILE* file = fopen(inputFileName.c_str(), "rb"); + std::ifstream file(inputFileName.c_str(), std::ios::binary); if (!file) { return false; } - int d[4]; - size_t readSize = fread(d, sizeof(int), 4, file); - ASSERT(readSize == 4); + file.read(reinterpret_cast(d), 4 * sizeof(int32_t)); ASSERT(mDims.d[0] == d[0] && mDims.d[1] == d[1] && mDims.d[2] == d[2] && mDims.d[3] == d[3]); - size_t readInputCount = fread(getFileBatch(), sizeof(float), mDims.d[0] * mImageSize, file); - ASSERT(readInputCount == size_t(mDims.d[0] * mImageSize)); - size_t readLabelCount = fread(getFileLabels(), sizeof(float), mDims.d[0], file); - ASSERT(readLabelCount == 0 || readLabelCount == size_t(mDims.d[0])); - - fclose(file); + file.read(reinterpret_cast(getFileBatch()), sizeof(float) * mDims.d[0] * mImageSize); + file.read(reinterpret_cast(getFileLabels()), sizeof(float) * mDims.d[0]); } else { @@ -368,7 +359,7 @@ class BatchStream : public IBatchStream return true; } - int mBatchSize{0}; + int64_t mBatchSize{0}; int mMaxBatches{0}; int mBatchCount{0}; int mFileCount{0}; diff --git a/src/Detector/tensorrt_yolo/common/EntropyCalibrator.h b/src/Detector/tensorrt_yolo/common/EntropyCalibrator.h index f31789bf..67a0130e 100644 --- a/src/Detector/tensorrt_yolo/common/EntropyCalibrator.h +++ b/src/Detector/tensorrt_yolo/common/EntropyCalibrator.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -28,8 +29,8 @@ template class EntropyCalibratorImpl { public: - EntropyCalibratorImpl( - TBatchStream stream, int firstBatch, std::string networkName, const char* inputBlobName, bool readCache = true) + EntropyCalibratorImpl(TBatchStream const& stream, int firstBatch, std::string const& networkName, + const char* inputBlobName, bool readCache = true) : mStream{stream} , mCalibrationTableName("CalibrationTable" + networkName) , mInputBlobName(inputBlobName) @@ -51,11 +52,12 @@ class EntropyCalibratorImpl return mStream.getBatchSize(); } - bool getBatch(void* bindings[], const char* names[], int /*nbBindings*/) noexcept + bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept { if (!mStream.next()) + { return false; - + } CHECK(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice)); ASSERT(!strcmp(names[0], mInputBlobName)); bindings[0] = mDeviceInput; @@ -101,8 +103,8 @@ template class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { public: - Int8EntropyCalibrator2( - TBatchStream stream, int firstBatch, const char* networkName, const char* inputBlobName, bool readCache = true) + Int8EntropyCalibrator2(TBatchStream const& stream, int32_t firstBatch, const char* networkName, + const char* inputBlobName, bool readCache = true) : mImpl(stream, firstBatch, networkName, inputBlobName, readCache) { } diff --git a/src/Detector/tensorrt_yolo/common/ErrorRecorder.h b/src/Detector/tensorrt_yolo/common/ErrorRecorder.h index 40b35fb5..bfb857c5 100644 --- a/src/Detector/tensorrt_yolo/common/ErrorRecorder.h +++ b/src/Detector/tensorrt_yolo/common/ErrorRecorder.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -16,7 +17,7 @@ #ifndef ERROR_RECORDER_H #define ERROR_RECORDER_H -#include "NvInferRuntimeCommon.h" +#include "NvInferRuntime.h" #include "logger.h" #include #include @@ -44,7 +45,7 @@ class SampleErrorRecorder : public IErrorRecorder public: SampleErrorRecorder() = default; - virtual ~SampleErrorRecorder() noexcept {} + ~SampleErrorRecorder() noexcept override {} int32_t getNbErrors() const noexcept final { return mErrorStack.size(); diff --git a/src/Detector/tensorrt_yolo/common/argsParser.h b/src/Detector/tensorrt_yolo/common/argsParser.h new file mode 100644 index 00000000..1f0b9025 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/argsParser.h @@ -0,0 +1,162 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef TENSORRT_ARGS_PARSER_H +#define TENSORRT_ARGS_PARSER_H + +#ifdef _MSC_VER +#include "getOptWin.h" +#else +#include +#endif +#include +#include +#include + +namespace samplesCommon +{ + +//! +//! \brief The SampleParams structure groups the basic parameters required by +//! all sample networks. +//! +struct SampleParams +{ + int32_t batchSize{1}; //!< Number of inputs in a batch + int32_t dlaCore{-1}; //!< Specify the DLA core to run network on. + bool int8{false}; //!< Allow runnning the network in Int8 mode. + bool fp16{false}; //!< Allow running the network in FP16 mode. + bool bf16{false}; //!< Allow running the network in BF16 mode. + std::vector dataDirs; //!< Directory paths where sample data files are stored + std::vector inputTensorNames; + std::vector outputTensorNames; + std::string timingCacheFile; //!< Path to timing cache file +}; + +//! +//! \brief The OnnxSampleParams structure groups the additional parameters required by +//! networks that use ONNX +//! +struct OnnxSampleParams : public SampleParams +{ + std::string onnxFileName; //!< Filename of ONNX file of a network +}; + +//! +//! /brief Struct to maintain command-line arguments. +//! +struct Args +{ + bool runInInt8{false}; + bool runInFp16{false}; + bool runInBf16{false}; + bool help{false}; + int32_t useDLACore{-1}; + int32_t batch{1}; + std::vector dataDirs; + std::string saveEngine; + std::string loadEngine; + bool rowOrder{true}; + std::string timingCacheFile; +}; + +//! +//! \brief Populates the Args struct with the provided command-line parameters. +//! +//! \throw invalid_argument if any of the arguments are not valid +//! +//! \return boolean If return value is true, execution can continue, otherwise program should exit +//! 
+inline bool parseArgs(Args& args, int32_t argc, char* argv[]) +{ + while (1) + { + int32_t arg; + static struct option long_options[] + = {{"help", no_argument, 0, 'h'}, {"datadir", required_argument, 0, 'd'}, {"int8", no_argument, 0, 'i'}, + {"fp16", no_argument, 0, 'f'}, {"bf16", no_argument, 0, 'z'}, {"columnOrder", no_argument, 0, 'c'}, + {"saveEngine", required_argument, 0, 's'}, {"loadEngine", required_argument, 0, 'o'}, + {"useDLACore", required_argument, 0, 'u'}, {"batch", required_argument, 0, 'b'}, + {"timingCacheFile", required_argument, 0, 't'}, {nullptr, 0, nullptr, 0}}; + int32_t option_index = 0; + arg = getopt_long(argc, argv, "hd:iu", long_options, &option_index); + if (arg == -1) + { + break; + } + + switch (arg) + { + case 'h': args.help = true; return true; + case 'd': + if (optarg) + { + args.dataDirs.push_back(optarg); + } + else + { + std::cerr << "ERROR: --datadir requires option argument" << std::endl; + return false; + } + break; + case 's': + if (optarg) + { + args.saveEngine = optarg; + } + break; + case 'o': + if (optarg) + { + args.loadEngine = optarg; + } + break; + case 'i': args.runInInt8 = true; break; + case 'f': args.runInFp16 = true; break; + case 'z': args.runInBf16 = true; break; + case 'c': args.rowOrder = false; break; + case 'u': + if (optarg) + { + args.useDLACore = std::stoi(optarg); + } + break; + case 'b': + if (optarg) + { + args.batch = std::stoi(optarg); + } + break; + case 't': + if (optarg) + { + args.timingCacheFile = optarg; + } + else + { + std::cerr << "ERROR: --timingCacheFile requires option argument" << std::endl; + return false; + } + break; + default: return false; + } + } + return true; +} + +} // namespace samplesCommon + +#endif // TENSORRT_ARGS_PARSER_H diff --git a/src/Detector/tensorrt_yolo/common/bfloat16.cpp b/src/Detector/tensorrt_yolo/common/bfloat16.cpp new file mode 100644 index 00000000..8222826a --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/bfloat16.cpp @@ -0,0 +1,60 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "bfloat16.h" +#include + +namespace sample +{ + +BFloat16::operator float() const +{ + static_assert(sizeof(uint32_t) == sizeof(float), ""); + float val{0.F}; + auto bits = static_cast(mRep) << 16; + std::memcpy(&val, &bits, sizeof(uint32_t)); + return val; +} + +BFloat16::BFloat16(float x) +{ + static_assert(sizeof(uint32_t) == sizeof(float), ""); + uint32_t bits{0}; + std::memcpy(&bits, &x, sizeof(float)); + + // FP32 format: 1 sign bit, 8 bit exponent, 23 bit mantissa + // BF16 format: 1 sign bit, 8 bit exponent, 7 bit mantissa + + // Mask for exponent + constexpr uint32_t exponent = 0xFFU << 23; + + // Check if exponent is all 1s (NaN or infinite) + if ((bits & exponent) != exponent) + { + // x is finite - round to even + bits += 0x7FFFU + (bits >> 16 & 1); + } + + mRep = static_cast(bits >> 16); +} + +BFloat16 operator+(BFloat16 x, BFloat16 y) +{ + return BFloat16(static_cast(x) + static_cast(y)); +} + +} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/bfloat16.h b/src/Detector/tensorrt_yolo/common/bfloat16.h new file mode 100644 index 00000000..0d0ab922 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/bfloat16.h @@ -0,0 +1,46 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace sample +{ + +//! Implements "Brain Floating Point": like an IEEE FP32, +//! but the significand is only 7 bits instead of 23 bits. +class BFloat16 +{ +public: + BFloat16() + : mRep(0) + { + } + + // Rounds to even if there is a tie. + BFloat16(float x); + + operator float() const; + +private: + //! Value stored in BFloat16 representation. + uint16_t mRep; +}; +BFloat16 operator+(BFloat16 x, BFloat16 y); + +} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/buffers.h b/src/Detector/tensorrt_yolo/common/buffers.h index ef673b2b..e58f2f5c 100644 --- a/src/Detector/tensorrt_yolo/common/buffers.h +++ b/src/Detector/tensorrt_yolo/common/buffers.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -238,28 +239,53 @@ class BufferManager public: static const size_t kINVALID_SIZE_VALUE = ~size_t(0); + //! + //! \brief Create a BufferManager for handling buffer interactions with engine, when the I/O tensor volumes + //! are provided + //! 
+ BufferManager( + std::shared_ptr engine, std::vector const& volumes, int32_t batchSize = 0) + : mEngine(engine) + , mBatchSize(batchSize) + { + // Create host and device buffers + for (int32_t i = 0; i < mEngine->getNbIOTensors(); i++) + { + auto const name = engine->getIOTensorName(i); + mNames[name] = i; + + nvinfer1::DataType type = mEngine->getTensorDataType(name); + + std::unique_ptr manBuf{new ManagedBuffer()}; + manBuf->deviceBuffer = DeviceBuffer(volumes[i], type); + manBuf->hostBuffer = HostBuffer(volumes[i], type); + void* deviceBuffer = manBuf->deviceBuffer.data(); + mDeviceBindings.emplace_back(deviceBuffer); + mManagedBuffers.emplace_back(std::move(manBuf)); + } + } + //! //! \brief Create a BufferManager for handling buffer interactions with engine. //! - BufferManager(std::shared_ptr engine, const int batchSize, - const nvinfer1::IExecutionContext* context = nullptr) + BufferManager(std::shared_ptr engine, int32_t const batchSize = 0, + nvinfer1::IExecutionContext const* context = nullptr) : mEngine(engine) , mBatchSize(batchSize) { - // Full Dims implies no batch size. - auto impbs = engine->hasImplicitBatchDimension(); - std::cout << "hasImplicitBatchDimension: " << impbs << ", mBatchSize = " << mBatchSize << std::endl; - assert(engine->hasImplicitBatchDimension() || mBatchSize == 0); // Create host and device buffers - for (int i = 0; i < mEngine->getNbBindings(); i++) + for (int32_t i = 0, e = mEngine->getNbIOTensors(); i < e; i++) { - auto dims = context ? context->getBindingDimensions(i) : mEngine->getBindingDimensions(i); + auto const name = engine->getIOTensorName(i); + mNames[name] = i; + + auto dims = context ? context->getTensorShape(name) : mEngine->getTensorShape(name); size_t vol = context || !mBatchSize ? 1 : static_cast(mBatchSize); - nvinfer1::DataType type = mEngine->getBindingDataType(i); - int vecDim = mEngine->getBindingVectorizedDim(i); + nvinfer1::DataType type = mEngine->getTensorDataType(name); + int32_t vecDim = mEngine->getTensorVectorizedDim(name); if (-1 != vecDim) // i.e., 0 != lgScalarsPerVector { - int scalarsPerVec = mEngine->getBindingComponentsPerElement(i); + int32_t scalarsPerVec = mEngine->getTensorComponentsPerElement(name); dims.d[vecDim] = divUp(dims.d[vecDim], scalarsPerVec); vol *= scalarsPerVec; } @@ -267,7 +293,8 @@ class BufferManager std::unique_ptr manBuf{new ManagedBuffer()}; manBuf->deviceBuffer = DeviceBuffer(vol, type); manBuf->hostBuffer = HostBuffer(vol, type); - mDeviceBindings.emplace_back(manBuf->deviceBuffer.data()); + void* deviceBuffer = manBuf->deviceBuffer.data(); + mDeviceBindings.emplace_back(deviceBuffer); mManagedBuffers.emplace_back(std::move(manBuf)); } } @@ -284,7 +311,7 @@ class BufferManager //! //! \brief Returns a vector of device buffers. //! - const std::vector& getDeviceBindings() const + std::vector const& getDeviceBindings() const { return mDeviceBindings; } @@ -293,7 +320,7 @@ class BufferManager //! \brief Returns the device buffer corresponding to tensorName. //! Returns nullptr if no such tensor can be found. //! - void* getDeviceBuffer(const std::string& tensorName) const + void* getDeviceBuffer(std::string const& tensorName) const { return getBuffer(false, tensorName); } @@ -302,72 +329,21 @@ class BufferManager //! \brief Returns the host buffer corresponding to tensorName. //! Returns nullptr if no such tensor can be found. //! - void* getHostBuffer(const std::string& tensorName) const + void* getHostBuffer(std::string const& tensorName) const { return getBuffer(true, tensorName); } - //! 
- //! \brief Returns the host buffer corresponding to tensorName. - //! Returns nullptr if no such tensor can be found. - //! - void* getHostBuffer(int bindingIndex) const - { - return getBuffer(true, bindingIndex); - } - //! //! \brief Returns the size of the host and device buffers that correspond to tensorName. //! Returns kINVALID_SIZE_VALUE if no such tensor can be found. //! - size_t size(const std::string& tensorName) const + size_t size(std::string const& tensorName) const { - int index = mEngine->getBindingIndex(tensorName.c_str()); - if (index == -1) + auto record = mNames.find(tensorName); + if (record == mNames.end()) return kINVALID_SIZE_VALUE; - return mManagedBuffers[index]->hostBuffer.nbBytes(); - } - - //! - //! \brief Dump host buffer with specified tensorName to ostream. - //! Prints error message to std::ostream if no such tensor can be found. - //! - void dumpBuffer(std::ostream& os, const std::string& tensorName) - { - int index = mEngine->getBindingIndex(tensorName.c_str()); - if (index == -1) - { - os << "Invalid tensor name" << std::endl; - return; - } - void* buf = mManagedBuffers[index]->hostBuffer.data(); - size_t bufSize = mManagedBuffers[index]->hostBuffer.nbBytes(); - nvinfer1::Dims bufDims = mEngine->getBindingDimensions(index); - size_t rowCount = static_cast(bufDims.nbDims > 0 ? bufDims.d[bufDims.nbDims - 1] : mBatchSize); - int leadDim = mBatchSize; - int* trailDims = bufDims.d; - int nbDims = bufDims.nbDims; - - // Fix explicit Dimension networks - if (!leadDim && nbDims > 0) - { - leadDim = bufDims.d[0]; - ++trailDims; - --nbDims; - } - - os << "[" << leadDim; - for (int i = 0; i < nbDims; i++) - os << ", " << trailDims[i]; - os << "]" << std::endl; - switch (mEngine->getBindingDataType(index)) - { - case nvinfer1::DataType::kINT32: print(os, buf, bufSize, rowCount); break; - case nvinfer1::DataType::kFLOAT: print(os, buf, bufSize, rowCount); break; - case nvinfer1::DataType::kHALF: print(os, buf, bufSize, rowCount); break; - case nvinfer1::DataType::kINT8: assert(0 && "Int8 network-level input and output is not supported"); break; - case nvinfer1::DataType::kBOOL: assert(0 && "Bool network-level input and output are not supported"); break; - } + return mManagedBuffers[record->second]->hostBuffer.nbBytes(); } //! @@ -382,7 +358,7 @@ class BufferManager assert(bufSize % sizeof(T) == 0); T* typedBuf = static_cast(buf); size_t numItems = bufSize / sizeof(T); - for (int i = 0; i < static_cast(numItems); i++) + for (int32_t i = 0; i < static_cast(numItems); i++) { // Handle rowCount == 1 case if (rowCount == 1 && i != static_cast(numItems) - 1) @@ -404,7 +380,7 @@ class BufferManager //! void copyInputToDevice() { - memcpyBuffers(true, false, false, 0); + memcpyBuffers(true, false, false); } //! @@ -412,13 +388,13 @@ class BufferManager //! void copyOutputToHost() { - memcpyBuffers(false, true, false, 0); + memcpyBuffers(false, true, false); } //! //! \brief Copy the contents of input host buffers to input device buffers asynchronously. //! - void copyInputToDeviceAsync(const cudaStream_t& stream) + void copyInputToDeviceAsync(cudaStream_t const& stream = 0) { memcpyBuffers(true, false, true, stream); } @@ -426,7 +402,7 @@ class BufferManager //! //! \brief Copy the contents of output device buffers to output host buffers asynchronously. //! 
- void copyOutputToHostAsync(const cudaStream_t& stream) + void copyOutputToHostAsync(cudaStream_t const& stream = 0) { memcpyBuffers(false, true, true, stream); } @@ -434,30 +410,31 @@ class BufferManager ~BufferManager() = default; private: - void* getBuffer(const bool isHost, const std::string& tensorName) const + void* getBuffer(bool const isHost, std::string const& tensorName) const { - int index = mEngine->getBindingIndex(tensorName.c_str()); - if (index == -1) + auto record = mNames.find(tensorName); + if (record == mNames.end()) return nullptr; - return (isHost ? mManagedBuffers[index]->hostBuffer.data() : mManagedBuffers[index]->deviceBuffer.data()); + return (isHost ? mManagedBuffers[record->second]->hostBuffer.data() + : mManagedBuffers[record->second]->deviceBuffer.data()); } - void* getBuffer(const bool isHost, int bindingIndex) const + bool tenosrIsInput(const std::string& tensorName) const { - if (bindingIndex == -1) - return nullptr; - return (isHost ? mManagedBuffers[bindingIndex]->hostBuffer.data() : mManagedBuffers[bindingIndex]->deviceBuffer.data()); + return mEngine->getTensorIOMode(tensorName.c_str()) == nvinfer1::TensorIOMode::kINPUT; } - void memcpyBuffers(const bool copyInput, const bool deviceToHost, const bool async, const cudaStream_t& stream) + void memcpyBuffers(bool const copyInput, bool const deviceToHost, bool const async, cudaStream_t const& stream = 0) { - for (int i = 0; i < mEngine->getNbBindings(); i++) + for (auto const& n : mNames) { - void* dstPtr = deviceToHost ? mManagedBuffers[i]->hostBuffer.data() : mManagedBuffers[i]->deviceBuffer.data(); - const void* srcPtr = deviceToHost ? mManagedBuffers[i]->deviceBuffer.data() : mManagedBuffers[i]->hostBuffer.data(); - const size_t byteSize = mManagedBuffers[i]->hostBuffer.nbBytes(); + void* dstPtr = deviceToHost ? mManagedBuffers[n.second]->hostBuffer.data() + : mManagedBuffers[n.second]->deviceBuffer.data(); + void const* srcPtr = deviceToHost ? mManagedBuffers[n.second]->deviceBuffer.data() + : mManagedBuffers[n.second]->hostBuffer.data(); + size_t const byteSize = mManagedBuffers[n.second]->hostBuffer.nbBytes(); const cudaMemcpyKind memcpyType = deviceToHost ? cudaMemcpyDeviceToHost : cudaMemcpyHostToDevice; - if ((copyInput && mEngine->bindingIsInput(i)) || (!copyInput && !mEngine->bindingIsInput(i))) + if ((copyInput && tenosrIsInput(n.first)) || (!copyInput && !tenosrIsInput(n.first))) { if (async) CHECK(cudaMemcpyAsync(dstPtr, srcPtr, byteSize, memcpyType, stream)); @@ -468,9 +445,10 @@ class BufferManager } std::shared_ptr mEngine; //!< The pointer to the engine - int mBatchSize = 0; //!< The batch size for legacy networks, 0 otherwise. + int mBatchSize; //!< The batch size for legacy networks, 0 otherwise. std::vector> mManagedBuffers; //!< The vector of pointers to managed buffers - std::vector mDeviceBindings; //!< The vector of device buffers needed for engine execution + std::vector mDeviceBindings; //!< The vector of device buffers needed for engine execution + std::unordered_map mNames; //!< The map of tensor name and index pairs }; } // namespace samplesCommon diff --git a/src/Detector/tensorrt_yolo/common/common.h b/src/Detector/tensorrt_yolo/common/common.h index 2270a2cd..538c6094 100644 --- a/src/Detector/tensorrt_yolo/common/common.h +++ b/src/Detector/tensorrt_yolo/common/common.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -16,22 +17,13 @@ #ifndef TENSORRT_COMMON_H #define TENSORRT_COMMON_H - -// For loadLibrary -#ifdef _MSC_VER -// Needed so that the max/min definitions in windows.h do not conflict with std::max/min. -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include -#undef NOMINMAX -#else -#include -#endif - #include "NvInfer.h" +#if !TRT_WINML #include "NvInferPlugin.h" +#endif #include "logger.h" +#include "safeCommon.h" +#include "timingCache.h" #include #include #include @@ -39,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -52,7 +45,15 @@ #include #include -#include "safeCommon.h" +#ifdef _MSC_VER +// For loadLibrary +// Needed so that the max/min definitions in windows.h do not conflict with std::max/min. +#define NOMINMAX +#include +#undef NOMINMAX +#else +#include +#endif #ifdef _MSC_VER #define FN_NAME __FUNCTION__ @@ -82,7 +83,7 @@ if (!(condition)) \ { \ sample::gLogError << "Assertion failure: " << #condition << std::endl; \ - abort(); \ + exit(EXIT_FAILURE); \ } \ } while (0) @@ -96,7 +97,7 @@ OBJ_GUARD(T) makeObjGuard(T_* t) { CHECK(!(std::is_base_of::value || std::is_same::value)); - auto deleter = [](T* t) { t->destroy(); }; + auto deleter = [](T* t) { delete t; }; return std::unique_ptr{static_cast(t), deleter}; } @@ -113,21 +114,6 @@ constexpr long double operator"" _KiB(long double val) return val * (1 << 10); } -// These is necessary if we want to be able to write 1_GiB instead of 1.0_GiB. -// Since the return type is signed, -1_GiB will work as expected. -constexpr long long int operator"" _GiB(unsigned long long val) -{ - return val * (1 << 30); -} -constexpr long long int operator"" _MiB(unsigned long long val) -{ - return val * (1 << 20); -} -constexpr long long int operator"" _KiB(unsigned long long val) -{ - return val * (1 << 10); -} - struct SimpleProfiler : public nvinfer1::IProfiler { struct Record @@ -136,7 +122,7 @@ struct SimpleProfiler : public nvinfer1::IProfiler int count{0}; }; - virtual void reportLayerTime(const char* layerName, float ms) noexcept + void reportLayerTime(const char* layerName, float ms) noexcept override { mProfile[layerName].count++; mProfile[layerName].time += ms; @@ -183,7 +169,7 @@ struct SimpleProfiler : public nvinfer1::IProfiler auto old_precision = out.precision(); // Output header { - out << std::setw(maxLayerNameLength) << layerNameStr << " "; + out << std::setfill(' ') << std::setw(maxLayerNameLength) << layerNameStr << " "; out << std::setw(12) << "Runtime, " << "%" << " "; @@ -214,80 +200,12 @@ struct SimpleProfiler : public nvinfer1::IProfiler std::map mProfile; }; -//! Locate path to file, given its filename or filepath suffix and possible dirs it might lie in. -//! Function will also walk back MAX_DEPTH dirs from CWD to check for such a file path. 
-inline std::string locateFile( - const std::string& filepathSuffix, const std::vector& directories, bool reportError = true) -{ - const int MAX_DEPTH{10}; - bool found{false}; - std::string filepath; - - for (auto& dir : directories) - { - if (!dir.empty() && dir.back() != '/') - { -#ifdef _MSC_VER - filepath = dir + "\\" + filepathSuffix; -#else - filepath = dir + "/" + filepathSuffix; -#endif - } - else - { - filepath = dir + filepathSuffix; - } - - for (int i = 0; i < MAX_DEPTH && !found; i++) - { - const std::ifstream checkFile(filepath); - found = checkFile.is_open(); - if (found) - { - break; - } - - filepath = "../" + filepath; // Try again in parent dir - } - - if (found) - { - break; - } - - filepath.clear(); - } - - // Could not find the file - if (filepath.empty()) - { - const std::string dirList = std::accumulate(directories.begin() + 1, directories.end(), directories.front(), - [](const std::string& a, const std::string& b) { return a + "\n\t" + b; }); - std::cout << "Could not find " << filepathSuffix << " in data directories:\n\t" << dirList << std::endl; - - if (reportError) - { - std::cout << "&&&& FAILED" << std::endl; - exit(EXIT_FAILURE); - } - } - - return filepath; -} - -inline void readPGMFile(const std::string& fileName, uint8_t* buffer, int inH, int inW) -{ - std::ifstream infile(fileName, std::ifstream::binary); - assert(infile.is_open() && "Attempting to read from a file that is not open."); - std::string magic, h, w, max; - infile >> magic >> h >> w >> max; - infile.seekg(1, infile.cur); - infile.read(reinterpret_cast(buffer), inH * inW); -} - namespace samplesCommon { - +using nvinfer1::utils::loadTimingCacheFile; +using nvinfer1::utils::buildTimingCacheFromFile; +using nvinfer1::utils::saveTimingCacheFile; +using nvinfer1::utils::updateTimingCacheFile; // Swaps endianness of an integral type. template ::value, int>::type = 0> inline T swapEndianness(const T& value) @@ -339,7 +257,7 @@ class TypedHostMemory : public HostMemory { mData = new ElemType[size]; }; - ~TypedHostMemory() noexcept + ~TypedHostMemory() noexcept override { delete[](ElemType*) mData; } @@ -360,7 +278,7 @@ inline void* safeCudaMalloc(size_t memSize) if (deviceMem == nullptr) { std::cerr << "Out of memory" << std::endl; - exit(1); + exit(EXIT_FAILURE); } return deviceMem; } @@ -375,25 +293,20 @@ struct InferDeleter template void operator()(T* obj) const { -#if (NV_TENSORRT_MAJOR < 8) - obj->destroy(); -#else delete obj; -#endif } }; template -using SampleUniquePtr = std::unique_ptr; +using SampleUniquePtr = std::unique_ptr; -static auto StreamDeleter = [](cudaStream_t* pStream) +static auto StreamDeleter = [](cudaStream_t* pStream) { + if (pStream) { - if (pStream) - { - cudaStreamDestroy(*pStream); - delete pStream; - } - }; + static_cast(cudaStreamDestroy(*pStream)); + delete pStream; + } +}; inline std::unique_ptr makeCudaStream() { @@ -531,7 +444,7 @@ inline float getMaxValue(const float* buffer, int64_t size) // // The default parameter values choosen arbitrarily. Range values should be choosen such that // we avoid underflow or overflow. Also range value should be non zero to avoid uniform zero scale tensor. -inline void setAllDynamicRanges(nvinfer1::INetworkDefinition* network, float inRange = 2.0f, float outRange = 4.0f) +inline void setAllDynamicRanges(nvinfer1::INetworkDefinition* network, float inRange = 2.0F, float outRange = 4.0F) { // Ensure that all layer inputs have a scale. 
for (int i = 0; i < network->getNbLayers(); i++) @@ -579,14 +492,15 @@ inline void setDummyInt8DynamicRanges(const nvinfer1::IBuilderConfig* c, nvinfer // Set dummy per-tensor dynamic range if Int8 mode is requested. if (c->getFlag(nvinfer1::BuilderFlag::kINT8)) { - sample::gLogWarning - << "Int8 calibrator not provided. Generating dummy per-tensor dynamic range. Int8 accuracy is not guaranteed." - << std::endl; + sample::gLogWarning << "Int8 calibrator not provided. Generating dummy per-tensor dynamic range. Int8 accuracy " + "is not guaranteed." + << std::endl; setAllDynamicRanges(n); } } -inline void enableDLA(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, int useDLACore, bool allowGPUFallback = true) +inline void enableDLA( + nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, int useDLACore, bool allowGPUFallback = true) { if (useDLACore >= 0) { @@ -627,18 +541,28 @@ inline uint32_t getElementSize(nvinfer1::DataType t) noexcept { switch (t) { - case nvinfer1::DataType::kINT32: return 4; + case nvinfer1::DataType::kINT64: return 8; + case nvinfer1::DataType::kINT32: case nvinfer1::DataType::kFLOAT: return 4; + case nvinfer1::DataType::kBF16: case nvinfer1::DataType::kHALF: return 2; case nvinfer1::DataType::kBOOL: - case nvinfer1::DataType::kINT8: return 1; + case nvinfer1::DataType::kUINT8: + case nvinfer1::DataType::kINT8: + case nvinfer1::DataType::kFP8: return 1; + case nvinfer1::DataType::kINT4: + ASSERT(false && "Element size is not implemented for sub-byte data-types"); } return 0; } -inline int64_t volume(const nvinfer1::Dims& d) +inline int64_t volume(nvinfer1::Dims const& dims, int32_t start, int32_t stop) { - return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); + ASSERT(start >= 0); + ASSERT(start <= stop); + ASSERT(stop <= dims.nbDims); + ASSERT(std::all_of(dims.d + start, dims.d + stop, [](int32_t x) { return x >= 0; })); + return std::accumulate(dims.d + start, dims.d + stop, int64_t{1}, std::multiplies{}); } template @@ -698,7 +622,7 @@ void writePPMFileWithBBox(const std::string& filename, PPM& ppm, const << ppm.w << " " << ppm.h << "\n" << ppm.max << "\n"; - auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); }; + auto round = [](float x) -> int { return int(std::floor(x + 0.5F)); }; const int x1 = std::min(std::max(0, round(int(bbox.x1))), W - 1); const int x2 = std::min(std::max(0, round(int(bbox.x2))), W - 1); const int y1 = std::min(std::max(0, round(int(bbox.y1))), H - 1); @@ -739,7 +663,7 @@ inline void writePPMFileWithBBox(const std::string& filename, vPPM ppm, std::vec << "\n" << ppm.w << " " << ppm.h << "\n" << ppm.max << "\n"; - auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); }; + auto round = [](float x) -> int { return int(std::floor(x + 0.5F)); }; for (auto bbox : dets) { @@ -778,7 +702,7 @@ class TimerBase virtual void stop() {} float microseconds() const noexcept { - return mMs * 1000.f; + return mMs * 1000.F; } float milliseconds() const noexcept { @@ -786,15 +710,15 @@ class TimerBase } float seconds() const noexcept { - return mMs / 1000.f; + return mMs / 1000.F; } void reset() noexcept { - mMs = 0.f; + mMs = 0.F; } protected: - float mMs{0.0f}; + float mMs{0.0F}; }; class GpuTimer : public TimerBase @@ -811,14 +735,14 @@ class GpuTimer : public TimerBase CHECK(cudaEventDestroy(mStart)); CHECK(cudaEventDestroy(mStop)); } - void start() + void start() override { CHECK(cudaEventRecord(mStart, mStream)); } - void stop() + void stop() override { CHECK(cudaEventRecord(mStop, 
mStream)); - float ms{0.0f}; + float ms{0.0F}; CHECK(cudaEventSynchronize(mStop)); CHECK(cudaEventElapsedTime(&ms, mStart, mStop)); mMs += ms; @@ -835,11 +759,11 @@ class CpuTimer : public TimerBase public: using clock_type = Clock; - void start() + void start() override { mStart = Clock::now(); } - void stop() + void stop() override { mStop = Clock::now(); mMs += std::chrono::duration{mStop - mStart}.count(); @@ -865,13 +789,7 @@ inline std::vector splitString(std::string str, char delimiter = ', return splitVect; } -// Return m rounded up to nearest multiple of n -inline int roundUp(int m, int n) -{ - return ((m + n - 1) / n) * n; -} - -inline int getC(const nvinfer1::Dims& d) +inline int getC(nvinfer1::Dims const& d) { return d.nbDims >= 3 ? d.d[d.nbDims - 3] : 1; } @@ -886,54 +804,111 @@ inline int getW(const nvinfer1::Dims& d) return d.nbDims >= 1 ? d.d[d.nbDims - 1] : 1; } -inline void loadLibrary(const std::string& path) +//! Platform-agnostic wrapper around dynamic libraries. +class DynamicLibrary { -#ifdef _MSC_VER - void* handle = LoadLibrary(path.c_str()); -#else - int32_t flags{RTLD_LAZY}; +public: + explicit DynamicLibrary(std::string const& name) + : mLibName{name} + { +#if defined(_WIN32) + mHandle = LoadLibraryA(name.c_str()); +#else // defined(_WIN32) + int32_t flags{RTLD_LAZY}; #if ENABLE_ASAN - // https://github.com/google/sanitizers/issues/89 - // asan doesn't handle module unloading correctly and there are no plans on doing - // so. In order to get proper stack traces, don't delete the shared library on - // close so that asan can resolve the symbols correctly. - flags |= RTLD_NODELETE; + // https://github.com/google/sanitizers/issues/89 + // asan doesn't handle module unloading correctly and there are no plans on doing + // so. In order to get proper stack traces, don't delete the shared library on + // close so that asan can resolve the symbols correctly. + flags |= RTLD_NODELETE; #endif // ENABLE_ASAN - void* handle = dlopen(path.c_str(), flags); + mHandle = dlopen(name.c_str(), flags); +#endif // defined(_WIN32) + + if (mHandle == nullptr) + { + std::string errorStr{}; +#if !defined(_WIN32) + errorStr = std::string{" due to "} + std::string{dlerror()}; #endif - if (handle == nullptr) + throw std::runtime_error("Unable to open library: " + name + errorStr); + } + } + + DynamicLibrary(DynamicLibrary const&) = delete; + DynamicLibrary(DynamicLibrary const&&) = delete; + + //! + //! Retrieve a function symbol from the loaded library. + //! + //! \return the loaded symbol on success + //! \throw std::invalid_argument if loading the symbol failed. + //! 
+ template + std::function symbolAddress(char const* name) { -#ifdef _MSC_VER - sample::gLogError << "Could not load plugin library: " << path << std::endl; + if (mHandle == nullptr) + { + throw std::runtime_error("Handle to library is nullptr."); + } + void* ret; +#if defined(_MSC_VER) + ret = static_cast(GetProcAddress(static_cast(mHandle), name)); #else - sample::gLogError << "Could not load plugin library: " << path << ", due to: " << dlerror() << std::endl; + ret = dlsym(mHandle, name); #endif + if (ret == nullptr) + { + std::string const kERROR_MSG(mLibName + ": error loading symbol: " + std::string(name)); + throw std::invalid_argument(kERROR_MSG); + } + return reinterpret_cast(ret); } -} -inline int32_t getSMVersion() -{ - int32_t deviceIndex = 0; - CHECK(cudaGetDevice(&deviceIndex)); + ~DynamicLibrary() + { + try + { +#if defined(_WIN32) + ASSERT(static_cast(FreeLibrary(static_cast(mHandle)))); +#else + ASSERT(dlclose(mHandle) == 0); +#endif + } + catch (...) + { + sample::gLogError << "Unable to close library: " << mLibName << std::endl; + } + } - int32_t major, minor; - CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, deviceIndex)); - CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, deviceIndex)); +private: + std::string mLibName{}; //!< Name of the DynamicLibrary + void* mHandle{}; //!< Handle to the DynamicLibrary +}; - return ((major << 8) | minor); +inline std::unique_ptr loadLibrary(std::string const& path) +{ + // make_unique not available until C++14 - we still need to support C++11 builds. + return std::unique_ptr(new DynamicLibrary{path}); } -inline bool isSMSafe() +inline int32_t getMaxPersistentCacheSize() { - const int32_t smVersion = getSMVersion(); - return smVersion == 0x0700 || smVersion == 0x0702 || smVersion == 0x0705 || - smVersion == 0x0800 || smVersion == 0x0806 || smVersion == 0x0807; + int32_t deviceIndex{}; + CHECK(cudaGetDevice(&deviceIndex)); + + int32_t maxPersistentL2CacheSize{}; +#if CUDART_VERSION >= 11030 && !TRT_WINML + CHECK(cudaDeviceGetAttribute(&maxPersistentL2CacheSize, cudaDevAttrMaxPersistingL2CacheSize, deviceIndex)); +#endif + + return maxPersistentL2CacheSize; } inline bool isDataTypeSupported(nvinfer1::DataType dataType) { - auto builder = SampleUniquePtr(nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger())); + auto builder = SampleUniquePtr(createBuilder()); if (!builder) { return false; @@ -947,7 +922,6 @@ inline bool isDataTypeSupported(nvinfer1::DataType dataType) return true; } - } // namespace samplesCommon inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) diff --git a/src/Detector/tensorrt_yolo/common/dumpTFWts.py b/src/Detector/tensorrt_yolo/common/dumpTFWts.py new file mode 100644 index 00000000..70770fbd --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/dumpTFWts.py @@ -0,0 +1,124 @@ +#!/usr/bin/python +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Script to dump TensorFlow weights in TRT v1 and v2 dump format. +# The V1 format is for TensorRT 4.0. The V2 format is for TensorRT 4.0 and later. + +import sys +import struct +import argparse + +try: + import tensorflow as tf + from tensorflow.python import pywrap_tensorflow +except ImportError as err: + sys.stderr.write("""Error: Failed to import module ({})""".format(err)) + sys.exit() + +parser = argparse.ArgumentParser(description="TensorFlow Weight Dumper") + +parser.add_argument( + "-m", + "--model", + required=True, + help="The checkpoint file basename, example basename(model.ckpt-766908.data-00000-of-00001) -> model.ckpt-766908", +) +parser.add_argument("-o", "--output", required=True, help="The weight file to dump all the weights to.") +parser.add_argument("-1", "--wtsv1", required=False, default=False, type=bool, help="Dump the weights in the wts v1.") + +opt = parser.parse_args() + +if opt.wtsv1: + print("Outputting the trained weights in TensorRT's wts v1 format. This format is documented as:") + print("Line 0: ") + print("Line 1-Num: [buffer name] [buffer type] [buffer size] ") +else: + print("Outputting the trained weights in TensorRT's wts v2 format. This format is documented as:") + print("Line 0: ") + print("Line 1-Num: [buffer name] [buffer type] [(buffer shape{e.g. (1, 2, 3)}] ") + +inputbase = opt.model +outputbase = opt.output + + +def float_to_hex(f): + return hex(struct.unpack(" +#include +#include + +namespace nvinfer1 +{ +namespace utils +{ +FileLock::FileLock(ILogger& logger, std::string const& fileName) + : mLogger(logger) + , mFileName(fileName) +{ + std::string lockFileName = mFileName + ".lock"; +#ifdef _MSC_VER + { + std::stringstream ss; + ss << "Trying to set exclusive file lock " << lockFileName << std::endl; + mLogger.log(ILogger::Severity::kVERBOSE, ss.str().c_str()); + } + // MS docs said this is a blocking IO if "FILE_FLAG_OVERLAPPED" is not provided + mHandle = CreateFileA(lockFileName.c_str(), GENERIC_WRITE, 0, NULL, OPEN_ALWAYS, 0, NULL); + if (mHandle == INVALID_HANDLE_VALUE) + { + throw std::runtime_error("Failed to lock " + lockFileName + "!"); + } +#elif defined(__QNX__) + // We once enabled the file lock on QNX, lockf(F_TLOCK) return -1 and the reported error is + // The error generated was 89, which means that the function is not implemented. 
+#else
+    mHandle = fopen(lockFileName.c_str(), "wb+");
+    if (mHandle == nullptr)
+    {
+        throw std::runtime_error("Cannot open " + lockFileName + "!");
+    }
+    {
+        std::stringstream ss;
+        ss << "Trying to set exclusive file lock " << lockFileName << std::endl;
+        mLogger.log(ILogger::Severity::kVERBOSE, ss.str().c_str());
+    }
+    mDescriptor = fileno(mHandle);
+    auto ret = lockf(mDescriptor, F_LOCK, 0);
+    if (ret != 0)
+    {
+        mDescriptor = -1;
+        fclose(mHandle);
+        throw std::runtime_error("Failed to lock " + lockFileName + "!");
+    }
+#endif
+}
+
+FileLock::~FileLock()
+{
+    std::string lockFileName = mFileName + ".lock";
+#ifdef _MSC_VER
+    if (mHandle != INVALID_HANDLE_VALUE)
+    {
+        CloseHandle(mHandle);
+    }
+#elif defined(__QNX__)
+    // The file lock was once enabled on QNX, but lockf(F_TLOCK) returned -1 with
+    // error 89, which means the function is not implemented.
+#else
+    if (mDescriptor != -1)
+    {
+        auto ret = lockf(mDescriptor, F_ULOCK, 0);
+        if (mHandle != nullptr)
+        {
+            fclose(mHandle);
+        }
+        if (ret != 0)
+        {
+            std::stringstream ss;
+            ss << "Failed to unlock " << lockFileName << ", please remove " << lockFileName << ".lock manually!"
+               << std::endl;
+            mLogger.log(ILogger::Severity::kVERBOSE, ss.str().c_str());
+        }
+    }
+#endif
+}
+} // namespace utils
+} // namespace nvinfer1
diff --git a/src/Detector/tensorrt_yolo/common/fileLock.h b/src/Detector/tensorrt_yolo/common/fileLock.h
new file mode 100644
index 00000000..d0f64a5b
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common/fileLock.h
@@ -0,0 +1,86 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TENSORRT_SAMPLES_COMMON_FILELOCK_H_
+#define TENSORRT_SAMPLES_COMMON_FILELOCK_H_
+#include "NvInfer.h"
+#ifdef _MSC_VER
+// Needed so that the max/min definitions in windows.h do not conflict with std::max/min.
+#define NOMINMAX
+#include <windows.h>
+#undef NOMINMAX
+#else
+#include <stdio.h>  // fileno
+#include <unistd.h> // lockf
+#endif
+#include <string>
+
+namespace nvinfer1
+{
+namespace utils
+{
+//!
+//! \brief RAII object that locks the specified file.
+//!
+//! The FileLock class uses a lock file to specify that the
+//! current file is being used by a TensorRT tool or sample
+//! so that things like the TimingCache can be updated across
+//! processes without having conflicts.
+//!
+class FileLock
+{
+public:
+    FileLock(nvinfer1::ILogger& logger, std::string const& fileName);
+    ~FileLock();
+    FileLock() = delete;                           // no default ctor
+    FileLock(FileLock const&) = delete;            // no copy ctor
+    FileLock& operator=(FileLock const&) = delete; // no copy assignment
+    FileLock(FileLock&&) = delete;                 // no move ctor
+    FileLock& operator=(FileLock&&) = delete;      // no move assignment
+
+private:
+    //!
+    //! The logger that emits any error messages that might show up.
+    //!
+    nvinfer1::ILogger& mLogger;
+
+    //!
+    //! The filename that the FileLock is protecting from multiple
+    //! TensorRT processes writing to it at the same time.
+    //!
+    std::string const mFileName;
+
+#ifdef _MSC_VER
+    //!
+    //! The file handle on windows for the file lock.
+    //!
+    HANDLE mHandle{};
+#else
+    //!
+    //! The file handle on linux for the file lock.
+    //!
+    FILE* mHandle{};
+    //!
+    //! The file descriptor on linux of the file lock.
+    //!
+    int32_t mDescriptor{-1};
+#endif
+}; // class FileLock
+} // namespace utils
+} // namespace nvinfer1
+
+#endif // TENSORRT_SAMPLES_COMMON_FILELOCK_H_
diff --git a/src/Detector/tensorrt_yolo/common/getOptions.cpp b/src/Detector/tensorrt_yolo/common/getOptions.cpp
new file mode 100644
index 00000000..19cd3281
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common/getOptions.cpp
@@ -0,0 +1,248 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "getOptions.h"
+#include "logger.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <cstring>
+#include <set>
+
+namespace nvinfer1
+{
+namespace utility
+{
+
+//! Matching for TRTOptions is defined as follows:
+//!
+//! If A and B both have longName set, A matches B if and only if A.longName ==
+//! B.longName and (A.shortName == B.shortName if both have short name set).
+//!
+//! If A only has shortName set and B only has longName set, then A does not
+//! match B. It is assumed that when 2 TRTOptions are compared, one of them is
+//! the definition of a TRTOption in the input to getOptions. As such, if the
+//! definition only has shortName set, it will never be equal to a TRTOption
+//! that does not have shortName set (and same for longName).
+//!
+//! If A and B both have shortName set but B does not have longName set, A
+//! matches B if and only if A.shortName == B.shortName.
+//!
+//! If A has neither long nor short name set, A matches B if and only if B has
+//! neither long nor short name set.
+bool matches(const TRTOption& a, const TRTOption& b)
+{
+    if (!a.longName.empty() && !b.longName.empty())
+    {
+        if (a.shortName && b.shortName)
+        {
+            return (a.longName == b.longName) && (a.shortName == b.shortName);
+        }
+        return a.longName == b.longName;
+    }
+
+    // If only one of them is not set, this will return false anyway.
+    return a.shortName == b.shortName;
+}
+
+//! getTRTOptionIndex returns the index of a TRTOption in a vector of
+//! TRTOptions, -1 if not found.
+int getTRTOptionIndex(const std::vector<TRTOption>& options, const TRTOption& opt)
+{
+    for (size_t i = 0; i < options.size(); ++i)
+    {
+        if (matches(opt, options[i]))
+        {
+            return i;
+        }
+    }
+    return -1;
+}
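// Illustration only, not part of the patch: with the matching rules above, a
// bare "-h" parsed from the command line matches a definition that also has a
// long name, because only the fields populated on both sides are compared.
#include <cassert>
#include <vector>

void exampleMatching()
{
    using nvinfer1::utility::TRTOption;
    const std::vector<TRTOption> defs{{'h', "help", false, "show usage"}};
    const TRTOption parsed{'h', "", false, ""}; // what "-h" parses to
    assert(nvinfer1::utility::getTRTOptionIndex(defs, parsed) == 0);
}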
+//! validateTRTOption will return a string containing an error message if the
+//! option contains a disallowed character, or if the option duplicates a name
+//! that was already seen. Otherwise, returns the empty string.
+std::string validateTRTOption(
+    const std::set<char>& seenShortNames, const std::set<std::string>& seenLongNames, const TRTOption& opt)
+{
+    if (opt.shortName != 0)
+    {
+        if (!std::isalnum(opt.shortName))
+        {
+            return "Short name '" + std::to_string(opt.shortName) + "' is non-alphanumeric";
+        }
+
+        if (seenShortNames.find(opt.shortName) != seenShortNames.end())
+        {
+            return "Short name '" + std::to_string(opt.shortName) + "' is a duplicate";
+        }
+    }
+
+    if (!opt.longName.empty())
+    {
+        for (const char& c : opt.longName)
+        {
+            if (!std::isalnum(c) && c != '-' && c != '_')
+            {
+                return "Long name '" + opt.longName + "' contains characters that are not '-', '_', or alphanumeric";
+            }
+        }
+
+        if (seenLongNames.find(opt.longName) != seenLongNames.end())
+        {
+            return "Long name '" + opt.longName + "' is a duplicate";
+        }
+    }
+    return "";
+}
+
+//! validateTRTOptions will return a string containing an error message if any
+//! options contain non-alphanumeric characters, or if there are duplicate option
+//! names found. Otherwise, returns the empty string.
+std::string validateTRTOptions(const std::vector<TRTOption>& options)
+{
+    std::set<char> seenShortNames;
+    std::set<std::string> seenLongNames;
+    for (size_t i = 0; i < options.size(); ++i)
+    {
+        const std::string errMsg = validateTRTOption(seenShortNames, seenLongNames, options[i]);
+        if (!errMsg.empty())
+        {
+            return "Error '" + errMsg + "' at TRTOption " + std::to_string(i);
+        }
+
+        seenShortNames.insert(options[i].shortName);
+        seenLongNames.insert(options[i].longName);
+    }
+    return "";
+}
+
+//! parseArgs parses an argument list and returns a TRTParsedArgs with the
+//! fields set accordingly. Assumes that options is validated.
+//! ErrMsg will be set if:
+//!     - an argument is null
+//!     - an argument is empty
+//!     - an argument does not specify an option (i.e. "-" and "--")
+//!     - a short argument has more than 1 character
+//!     - the last argument in the list requires a value
+TRTParsedArgs parseArgs(int argc, const char* const* argv, const std::vector<TRTOption>& options)
+{
+    TRTParsedArgs parsedArgs;
+    parsedArgs.values.resize(options.size());
+
+    for (int i = 1; i < argc; ++i) // index of current command-line argument
+    {
+        if (argv[i] == nullptr)
+        {
+            return TRTParsedArgs{"Null argument at index " + std::to_string(i)};
+        }
+
+        const std::string argStr(argv[i]);
+        if (argStr.empty())
+        {
+            return TRTParsedArgs{"Empty argument at index " + std::to_string(i)};
+        }
+
+        // No starting hyphen means it is a positional argument
+        if (argStr[0] != '-')
+        {
+            parsedArgs.positionalArgs.push_back(argStr);
+            continue;
+        }
+
+        if (argStr == "-" || argStr == "--")
+        {
+            return TRTParsedArgs{"Argument does not specify an option at index " + std::to_string(i)};
+        }
+
+        // If only 1 hyphen, char after is the flag.
+ TRTOption opt{' ', "", false, ""}; + std::string value; + if (argStr[1] != '-') + { + // Must only have 1 char after the hyphen + if (argStr.size() > 2) + { + return TRTParsedArgs{"Short arg contains more than 1 character at index " + std::to_string(i)}; + } + opt.shortName = argStr[1]; + } + else + { + opt.longName = argStr.substr(2); + + // We need to support --foo=bar syntax, so look for '=' + const size_t eqIndex = opt.longName.find('='); + if (eqIndex < opt.longName.size()) + { + value = opt.longName.substr(eqIndex + 1); + opt.longName = opt.longName.substr(0, eqIndex); + } + } + + const int idx = getTRTOptionIndex(options, opt); + if (idx < 0) + { + continue; + } + + if (options[idx].valueRequired) + { + if (!value.empty()) + { + parsedArgs.values[idx].second.push_back(value); + parsedArgs.values[idx].first = parsedArgs.values[idx].second.size(); + continue; + } + + if (i + 1 >= argc) + { + return TRTParsedArgs{"Last argument requires value, but none given"}; + } + + const std::string nextArg(argv[i + 1]); + if (nextArg.size() >= 1 && nextArg[0] == '-') + { + sample::gLogWarning << "Warning: Using '" << nextArg << "' as a value for '" << argStr + << "', Should this be its own flag?" << std::endl; + } + + parsedArgs.values[idx].second.push_back(nextArg); + i += 1; // Next argument already consumed + + parsedArgs.values[idx].first = parsedArgs.values[idx].second.size(); + } + else + { + parsedArgs.values[idx].first += 1; + } + } + return parsedArgs; +} + +TRTParsedArgs getOptions(int argc, const char* const* argv, const std::vector& options) +{ + const std::string errMsg = validateTRTOptions(options); + if (!errMsg.empty()) + { + return TRTParsedArgs{errMsg}; + } + return parseArgs(argc, argv, options); +} +} // namespace utility +} // namespace nvinfer1 diff --git a/src/Detector/tensorrt_yolo/common/getOptions.h b/src/Detector/tensorrt_yolo/common/getOptions.h new file mode 100644 index 00000000..4bbf9e27 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/getOptions.h @@ -0,0 +1,128 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_GET_OPTIONS_H +#define TRT_GET_OPTIONS_H + +#include +#include +#include + +namespace nvinfer1 +{ +namespace utility +{ + +//! TRTOption defines a command line option. At least 1 of shortName and longName +//! must be defined. +//! If bool initialization is undefined behavior on your system, valueRequired +//! must also be explicitly defined. +//! helpText is optional. +struct TRTOption +{ + char shortName; //!< Option name in short (single hyphen) form (i.e. -a, -b) + std::string longName; //!< Option name in long (double hyphen) form (i.e. --foo, --bar) + bool valueRequired; //!< True if a value is needed for an option (i.e. -N 4, --foo bar) + std::string helpText; //!< Text to show when printing out the command usage +}; + +//! 
TRTParsedArgs is returned by getOptions after it has parsed a command line
+//! argument list (argv).
+//!
+//! errMsg is a string containing an error message if any errors occurred. If it
+//! is empty, no errors occurred.
+//!
+//! values stores a vector of pairs for each option (ordered by order in the
+//! input). Each pair contains an int (the number of occurrences) and a vector
+//! of strings (a list of values). The user should know which of these to use,
+//! and which options required values. For non-value options, only occurrences is
+//! populated. For value-required options, occurrences == # of values. Values do
+//! not need to be unique.
+//!
+//! positionalArgs stores additional arguments that are passed in without an
+//! option (these must not start with a hyphen).
+struct TRTParsedArgs
+{
+    std::string errMsg;
+    std::vector<std::pair<int, std::vector<std::string>>> values;
+    std::vector<std::string> positionalArgs;
+};
+
+//! Parse the input arguments passed to main() and extract options as well as
+//! positional arguments.
+//!
+//! Options are supposed to be passed to main() with a preceding hyphen '-'.
+//!
+//! If there is a single preceding hyphen, there should be exactly 1 character
+//! after the hyphen, which is interpreted as the option.
+//!
+//! If there are 2 preceding hyphens, the entire argument (without the hyphens)
+//! is interpreted as the option.
+//!
+//! If the option requires a value, the next argument is used as the value.
+//!
+//! Positional arguments must not start with a hyphen.
+//!
+//! If an argument requires a value, the next argument is interpreted as the
+//! value, even if it has the form of a valid option (i.e. --foo --bar will store
+//! "--bar" as a value for option "foo" if "foo" requires a value).
+//! We also support --name=value syntax. In this case, 'value' would be used as
+//! the value, NOT the next argument.
+//!
+//! For options:
+//! { { 'a', "", false },
+//!   { 'b', "", false },
+//!   { 0, "cee", false },
+//!   { 'd', "", true },
+//!   { 'e', "", true },
+//!   { 'f', "foo", true } }
+//!
+//! ./main hello world -a -a --cee -d 12 -f 34
+//! and
+//! ./main hello world -a -a --cee -d 12 --foo 34
+//!
+//! will result in:
+//!
+//! TRTParsedArgs {
+//!      errMsg: "",
+//!      values: { { 2, {} },
+//!                { 0, {} },
+//!                { 1, {} },
+//!                { 1, {"12"} },
+//!                { 0, {} },
+//!                { 1, {"34"} } }
+//!      positionalArgs: {"hello", "world"},
+//! }
+//!
+//! Non-POSIX behavior:
+//!     - Does not support "-abcde" as a shorthand for "-a -b -c -d -e". Each
+//!       option must have its own hyphen prefix.
+//!     - Does not support -e12 as a shorthand for "-e 12". Values MUST be
+//!       whitespace-separated from the option it is for.
+//!
+//! @param[in] argc The number of arguments passed to main (including the
+//!            file name, which is disregarded)
+//! @param[in] argv The arguments passed to main (including the file name,
+//!            which is disregarded)
+//! @param[in] options List of TRTOptions to parse
+//! @return TRTParsedArgs. See TRTParsedArgs documentation for descriptions of
+//!         the fields.
+TRTParsedArgs getOptions(int argc, const char* const* argv, const std::vector<TRTOption>& options);
+} // namespace utility
+} // namespace nvinfer1
+
+#endif // TRT_GET_OPTIONS_H
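A minimal sketch (not part of the patch) of driving getOptions from a main(); the option set here is made up for illustration and follows the documented example above:

    #include "getOptions.h"
    #include <iostream>
    #include <vector>

    int main(int argc, char** argv)
    {
        using nvinfer1::utility::TRTOption;
        using nvinfer1::utility::TRTParsedArgs;

        const std::vector<TRTOption> options{
            {'v', "verbose", false, "enable verbose output"},
            {'m', "model", true, "path to the ONNX model"},
        };

        TRTParsedArgs args = nvinfer1::utility::getOptions(argc, argv, options);
        if (!args.errMsg.empty())
        {
            std::cerr << args.errMsg << std::endl;
            return 1;
        }

        // values[i] corresponds to options[i]: .first counts occurrences,
        // .second holds the collected values for value-required options.
        if (args.values[1].first > 0)
        {
            std::cout << "model: " << args.values[1].second.back() << std::endl;
        }
        if (args.values[0].first > 0)
        {
            std::cout << args.positionalArgs.size() << " positional arguments" << std::endl;
        }
        return 0;
    }
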
diff --git a/src/Detector/tensorrt_yolo/common/getopt.c b/src/Detector/tensorrt_yolo/common/getopt.c
new file mode 100644
index 00000000..c1da08b5
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common/getopt.c
@@ -0,0 +1,568 @@
+/*	$OpenBSD: getopt_long.c,v 1.23 2007/10/31 12:34:57 chl Exp $	*/
+/*	$NetBSD: getopt_long.c,v 1.15 2002/01/31 22:43:40 tv Exp $	*/
+
+/*
+ * Copyright (c) 2002 Todd C. Miller <Todd.Miller@courtesan.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F39502-99-1-0512.
+ */
+/*-
+ * Copyright (c) 2000 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Dieter Baron and Thomas Klausner.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */ + +#include "getoptWin.h" +#include +#include +#include +#include +#include +#include + +#define REPLACE_GETOPT /* use this getopt as the system getopt(3) */ + +#ifdef REPLACE_GETOPT +int opterr = 1; /* if error message should be printed */ +int optind = 1; /* index into parent argv vector */ +int optopt = '?'; /* character checked for validity */ +#undef optreset /* see getopt.h */ +#define optreset __mingw_optreset +int optreset; /* reset getopt */ +char* optarg; /* argument associated with option */ +#endif + +#define PRINT_ERROR ((opterr) && (*options != ':')) + +#define FLAG_PERMUTE 0x01 /* permute non-options to the end of argv */ +#define FLAG_ALLARGS 0x02 /* treat non-options as args to option "-1" */ +#define FLAG_LONGONLY 0x04 /* operate as getopt_long_only */ + +/* return values */ +#define BADCH (int) '?' +#define BADARG ((*options == ':') ? (int) ':' : (int) '?') +#define INORDER (int) 1 + +#ifndef __CYGWIN__ +#define __progname __argv[0] +#else +extern char __declspec(dllimport) * __progname; +#endif + +#ifdef __CYGWIN__ +static char EMSG[] = ""; +#else +#define EMSG "" +#endif + +static int getopt_internal(int, char* const*, char const*, const struct option*, int*, int); +static int parse_long_options(char* const*, char const*, const struct option*, int*, int); +static int gcd(int, int); +static void permute_args(int, int, int, char* const*); + +static char* place = EMSG; /* option letter processing */ + +/* XXX: set optreset to 1 rather than these two */ +static int nonopt_start = -1; /* first non option argument (for permute) */ +static int nonopt_end = -1; /* first option after non options (for permute) */ + +/* Error messages */ +static char const recargchar[] = "option requires an argument -- %c"; +static char const recargstring[] = "option requires an argument -- %s"; +static char const ambig[] = "ambiguous option -- %.*s"; +static char const noarg[] = "option doesn't take an argument -- %.*s"; +static char const illoptchar[] = "unknown option -- %c"; +static char const illoptstring[] = "unknown option -- %s"; + +static void _vwarnx(char const* fmt, va_list ap) +{ + (void) fprintf(stderr, "%s: ", __progname); + if (fmt != NULL) + (void) vfprintf(stderr, fmt, ap); + (void) fprintf(stderr, "\n"); +} + +static void warnx(char const* fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + _vwarnx(fmt, ap); + va_end(ap); +} + +/* + * Compute the greatest common divisor of a and b. + */ +static int gcd(int a, int b) +{ + int c; + + c = a % b; + while (c != 0) + { + a = b; + b = c; + c = a % b; + } + + return (b); +} + +/* + * Exchange the block from nonopt_start to nonopt_end with the block + * from nonopt_end to opt_end (keeping the same order of arguments + * in each block). 
+ */ +static void permute_args(int panonopt_start, int panonopt_end, int opt_end, char* const* nargv) +{ + int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos; + char* swap; + + /* + * compute lengths of blocks and number and size of cycles + */ + nnonopts = panonopt_end - panonopt_start; + nopts = opt_end - panonopt_end; + ncycle = gcd(nnonopts, nopts); + cyclelen = (opt_end - panonopt_start) / ncycle; + + for (i = 0; i < ncycle; i++) + { + cstart = panonopt_end + i; + pos = cstart; + for (j = 0; j < cyclelen; j++) + { + if (pos >= panonopt_end) + pos -= nnonopts; + else + pos += nopts; + swap = nargv[pos]; + /* LINTED const cast */ + ((char**) nargv)[pos] = nargv[cstart]; + /* LINTED const cast */ + ((char**) nargv)[cstart] = swap; + } + } +} + +/* + * parse_long_options -- + * Parse long options in argc/argv argument vector. + * Returns -1 if short_too is set and the option does not match long_options. + */ +static int parse_long_options( + char* const* nargv, char const* options, const struct option* long_options, int* idx, int short_too) +{ + char *current_argv, *has_equal; + size_t current_argv_len; + int i, ambiguous, match; + +#define IDENTICAL_INTERPRETATION(_x, _y) \ + (long_options[(_x)].has_arg == long_options[(_y)].has_arg && long_options[(_x)].flag == long_options[(_y)].flag \ + && long_options[(_x)].val == long_options[(_y)].val) + + current_argv = place; + match = -1; + ambiguous = 0; + + optind++; + + if ((has_equal = strchr(current_argv, '=')) != NULL) + { + /* argument found (--option=arg) */ + current_argv_len = has_equal - current_argv; + has_equal++; + } + else + current_argv_len = strlen(current_argv); + + for (i = 0; long_options[i].name; i++) + { + /* find matching long option */ + if (strncmp(current_argv, long_options[i].name, current_argv_len)) + continue; + + if (strlen(long_options[i].name) == current_argv_len) + { + /* exact match */ + match = i; + ambiguous = 0; + break; + } + /* + * If this is a known short option, don't allow + * a partial match of a single character. + */ + if (short_too && current_argv_len == 1) + continue; + + if (match == -1) /* partial match */ + match = i; + else if (!IDENTICAL_INTERPRETATION(i, match)) + ambiguous = 1; + } + if (ambiguous) + { + /* ambiguous abbreviation */ + if (PRINT_ERROR) + warnx(ambig, (int) current_argv_len, current_argv); + optopt = 0; + return (BADCH); + } + if (match != -1) + { /* option found */ + if (long_options[match].has_arg == no_argument && has_equal) + { + if (PRINT_ERROR) + warnx(noarg, (int) current_argv_len, current_argv); + /* + * XXX: GNU sets optopt to val regardless of flag + */ + if (long_options[match].flag == NULL) + optopt = long_options[match].val; + else + optopt = 0; + return (BADARG); + } + if (long_options[match].has_arg == required_argument || long_options[match].has_arg == optional_argument) + { + if (has_equal) + optarg = has_equal; + else if (long_options[match].has_arg == required_argument) + { + /* + * optional argument doesn't use next nargv + */ + optarg = nargv[optind++]; + } + } + if ((long_options[match].has_arg == required_argument) && (optarg == NULL)) + { + /* + * Missing argument; leading ':' indicates no error + * should be generated. 
+ */ + if (PRINT_ERROR) + warnx(recargstring, current_argv); + /* + * XXX: GNU sets optopt to val regardless of flag + */ + if (long_options[match].flag == NULL) + optopt = long_options[match].val; + else + optopt = 0; + --optind; + return (BADARG); + } + } + else + { /* unknown option */ + if (short_too) + { + --optind; + return (-1); + } + if (PRINT_ERROR) + warnx(illoptstring, current_argv); + optopt = 0; + return (BADCH); + } + if (idx) + *idx = match; + if (long_options[match].flag) + { + *long_options[match].flag = long_options[match].val; + return (0); + } + else + return (long_options[match].val); +#undef IDENTICAL_INTERPRETATION +} + +/* + * getopt_internal -- + * Parse argc/argv argument vector. Called by user level routines. + */ +static int getopt_internal( + int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx, int flags) +{ + char const* oli; /* option letter list index */ + int optchar, short_too; + static int posixly_correct = -1; + + if (options == NULL) + return (-1); + + /* + * XXX Some GNU programs (like cvs) set optind to 0 instead of + * XXX using optreset. Work around this braindamage. + */ + if (optind == 0) + optind = optreset = 1; + + /* + * Disable GNU extensions if POSIXLY_CORRECT is set or options + * string begins with a '+'. + * + * CV, 2009-12-14: Check POSIXLY_CORRECT anew if optind == 0 or + * optreset != 0 for GNU compatibility. + */ + if (posixly_correct == -1 || optreset != 0) + posixly_correct = (getenv("POSIXLY_CORRECT") != NULL); + if (*options == '-') + flags |= FLAG_ALLARGS; + else if (posixly_correct || *options == '+') + flags &= ~FLAG_PERMUTE; + if (*options == '+' || *options == '-') + options++; + + optarg = NULL; + if (optreset) + nonopt_start = nonopt_end = -1; +start: + if (optreset || !*place) + { /* update scanning pointer */ + optreset = 0; + if (optind >= nargc) + { /* end of argument vector */ + place = EMSG; + if (nonopt_end != -1) + { + /* do permutation, if we have to */ + permute_args(nonopt_start, nonopt_end, optind, nargv); + optind -= nonopt_end - nonopt_start; + } + else if (nonopt_start != -1) + { + /* + * If we skipped non-options, set optind + * to the first of them. + */ + optind = nonopt_start; + } + nonopt_start = nonopt_end = -1; + return (-1); + } + if (*(place = nargv[optind]) != '-' || (place[1] == '\0' && strchr(options, '-') == NULL)) + { + place = EMSG; /* found non-option */ + if (flags & FLAG_ALLARGS) + { + /* + * GNU extension: + * return non-option as argument to option 1 + */ + optarg = nargv[optind++]; + return (INORDER); + } + if (!(flags & FLAG_PERMUTE)) + { + /* + * If no permutation wanted, stop parsing + * at first non-option. + */ + return (-1); + } + /* do permutation */ + if (nonopt_start == -1) + nonopt_start = optind; + else if (nonopt_end != -1) + { + permute_args(nonopt_start, nonopt_end, optind, nargv); + nonopt_start = optind - (nonopt_end - nonopt_start); + nonopt_end = -1; + } + optind++; + /* process next argument */ + goto start; + } + if (nonopt_start != -1 && nonopt_end == -1) + nonopt_end = optind; + + /* + * If we have "-" do nothing, if "--" we are done. + */ + if (place[1] != '\0' && *++place == '-' && place[1] == '\0') + { + optind++; + place = EMSG; + /* + * We found an option (--), so if we skipped + * non-options, we have to permute. 
+ */ + if (nonopt_end != -1) + { + permute_args(nonopt_start, nonopt_end, optind, nargv); + optind -= nonopt_end - nonopt_start; + } + nonopt_start = nonopt_end = -1; + return (-1); + } + } + + /* + * Check long options if: + * 1) we were passed some + * 2) the arg is not just "-" + * 3) either the arg starts with -- we are getopt_long_only() + */ + if (long_options != NULL && place != nargv[optind] && (*place == '-' || (flags & FLAG_LONGONLY))) + { + short_too = 0; + if (*place == '-') + place++; /* --foo long option */ + else if (*place != ':' && strchr(options, *place) != NULL) + short_too = 1; /* could be short option too */ + + optchar = parse_long_options(nargv, options, long_options, idx, short_too); + if (optchar != -1) + { + place = EMSG; + return (optchar); + } + } + + if ((optchar = (int) *place++) == (int) ':' || (optchar == (int) '-' && *place != '\0') + || (oli = strchr(options, optchar)) == NULL) + { + /* + * If the user specified "-" and '-' isn't listed in + * options, return -1 (non-option) as per POSIX. + * Otherwise, it is an unknown option character (or ':'). + */ + if (optchar == (int) '-' && *place == '\0') + return (-1); + if (!*place) + ++optind; + if (PRINT_ERROR) + warnx(illoptchar, optchar); + optopt = optchar; + return (BADCH); + } + if (long_options != NULL && optchar == 'W' && oli[1] == ';') + { + /* -W long-option */ + if (*place) /* no space */ + /* NOTHING */; + else if (++optind >= nargc) + { /* no arg */ + place = EMSG; + if (PRINT_ERROR) + warnx(recargchar, optchar); + optopt = optchar; + return (BADARG); + } + else /* white space */ + place = nargv[optind]; + optchar = parse_long_options(nargv, options, long_options, idx, 0); + place = EMSG; + return (optchar); + } + if (*++oli != ':') + { /* doesn't take argument */ + if (!*place) + ++optind; + } + else + { /* takes (optional) argument */ + optarg = NULL; + if (*place) /* no white space */ + optarg = place; + else if (oli[1] != ':') + { /* arg not optional */ + if (++optind >= nargc) + { /* no arg */ + place = EMSG; + if (PRINT_ERROR) + warnx(recargchar, optchar); + optopt = optchar; + return (BADARG); + } + else + optarg = nargv[optind]; + } + place = EMSG; + ++optind; + } + /* dump back option letter */ + return (optchar); +} + +#ifdef REPLACE_GETOPT +/* + * getopt -- + * Parse argc/argv argument vector. + * + * [eventually this will replace the BSD getopt] + */ +int getopt(int nargc, char* const* nargv, char const* options) +{ + + /* + * We don't pass FLAG_PERMUTE to getopt_internal() since + * the BSD getopt(3) (unlike GNU) has never done this. + * + * Furthermore, since many privileged programs call getopt() + * before dropping privileges it makes sense to keep things + * as simple (and bug-free) as possible. + */ + return (getopt_internal(nargc, nargv, options, NULL, NULL, 0)); +} +#endif /* REPLACE_GETOPT */ + +/* + * getopt_long -- + * Parse argc/argv argument vector. + */ +int getopt_long(int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx) +{ + + return (getopt_internal(nargc, nargv, options, long_options, idx, FLAG_PERMUTE)); +} + +/* + * getopt_long_only -- + * Parse argc/argv argument vector. 
+ */ +int getopt_long_only(int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx) +{ + + return (getopt_internal(nargc, nargv, options, long_options, idx, FLAG_PERMUTE | FLAG_LONGONLY)); +} diff --git a/src/Detector/tensorrt_yolo/common/getoptWin.h b/src/Detector/tensorrt_yolo/common/getoptWin.h new file mode 100644 index 00000000..a1dc6ffa --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/getoptWin.h @@ -0,0 +1,124 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __GETOPT_H__ +/** + * DISCLAIMER + * This file has no copyright assigned and is placed in the Public Domain. + * This file is a part of the w64 mingw-runtime package. + * + * The w64 mingw-runtime package and its code is distributed in the hope that it + * will be useful but WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESSED OR + * IMPLIED ARE HEREBY DISCLAIMED. This includes but is not limited to + * warranties of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#define __GETOPT_H__ + +/* All the headers include this file. */ +#include + +#if defined(WINGETOPT_SHARED_LIB) +#if defined(BUILDING_WINGETOPT_DLL) +#define WINGETOPT_API __declspec(dllexport) +#else +#define WINGETOPT_API __declspec(dllimport) +#endif +#else +#define WINGETOPT_API +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif + + WINGETOPT_API extern int optind; /* index of first non-option in argv */ + WINGETOPT_API extern int optopt; /* single option character, as parsed */ + WINGETOPT_API extern int opterr; /* flag to enable built-in diagnostics... */ + /* (user may set to zero, to suppress) */ + + WINGETOPT_API extern char* optarg; /* pointer to argument of current option */ + + extern int getopt(int nargc, char* const* nargv, char const* options); + +#ifdef _BSD_SOURCE +/* + * BSD adds the non-standard `optreset' feature, for reinitialisation + * of `getopt' parsing. We support this feature, for applications which + * proclaim their BSD heritage, before including this header; however, + * to maintain portability, developers are advised to avoid it. + */ +#define optreset __mingw_optreset + extern int optreset; +#endif +#ifdef __cplusplus +} +#endif +/* + * POSIX requires the `getopt' API to be specified in `unistd.h'; + * thus, `unistd.h' includes this header. However, we do not want + * to expose the `getopt_long' or `getopt_long_only' APIs, when + * included in this manner. Thus, close the standard __GETOPT_H__ + * declarations block, and open an additional __GETOPT_LONG_H__ + * specific block, only when *not* __UNISTD_H_SOURCED__, in which + * to declare the extended API. + */ +#endif /* !defined(__GETOPT_H__) */ + +#if !defined(__UNISTD_H_SOURCED__) && !defined(__GETOPT_LONG_H__) +#define __GETOPT_LONG_H__ + +#ifdef __cplusplus +extern "C" +{ +#endif + + struct option /* specification for a long form option... 
*/ + { + char const* name; /* option name, without leading hyphens */ + int has_arg; /* does it take an argument? */ + int* flag; /* where to save its status, or NULL */ + int val; /* its associated status value */ + }; + + enum /* permitted values for its `has_arg' field... */ + { + no_argument = 0, /* option never takes an argument */ + required_argument, /* option always requires an argument */ + optional_argument /* option may take an argument */ + }; + + extern int getopt_long( + int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx); + extern int getopt_long_only( + int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx); +/* + * Previous MinGW implementation had... + */ +#ifndef HAVE_DECL_GETOPT +/* + * ...for the long form API only; keep this for compatibility. + */ +#define HAVE_DECL_GETOPT 1 +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* !defined(__UNISTD_H_SOURCED__) && !defined(__GETOPT_LONG_H__) */ diff --git a/src/Detector/tensorrt_yolo/common/half.h b/src/Detector/tensorrt_yolo/common/half.h index 0755c316..b997e7db 100644 --- a/src/Detector/tensorrt_yolo/common/half.h +++ b/src/Detector/tensorrt_yolo/common/half.h @@ -16,13 +16,14 @@ // OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. /* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -1522,14 +1523,14 @@ class half /// \return incremented half value half& operator++() { - return *this += 1.0f; + return *this += 1.0F; } /// Prefix decrement. /// \return decremented half value half& operator--() { - return *this -= 1.0f; + return *this -= 1.0F; } /// Postfix increment. diff --git a/src/Detector/tensorrt_yolo/common/logger.cpp b/src/Detector/tensorrt_yolo/common/logger.cpp index 03c64398..909ec0bb 100644 --- a/src/Detector/tensorrt_yolo/common/logger.cpp +++ b/src/Detector/tensorrt_yolo/common/logger.cpp @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -17,7 +18,7 @@ #include "logger.h" #include "ErrorRecorder.h" #include "logging.h" - +using namespace nvinfer1; SampleErrorRecorder gRecorder; namespace sample { diff --git a/src/Detector/tensorrt_yolo/common/logger.h b/src/Detector/tensorrt_yolo/common/logger.h index 3069e8e9..8205e457 100644 --- a/src/Detector/tensorrt_yolo/common/logger.h +++ b/src/Detector/tensorrt_yolo/common/logger.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/src/Detector/tensorrt_yolo/common/logging.h b/src/Detector/tensorrt_yolo/common/logging.h index 78732c10..69273a5e 100644 --- a/src/Detector/tensorrt_yolo/common/logging.h +++ b/src/Detector/tensorrt_yolo/common/logging.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -17,7 +18,7 @@ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H -#include "NvInferRuntimeCommon.h" +#include "NvInferRuntime.h" #include "sampleOptions.h" #include #include @@ -162,7 +163,7 @@ class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream } LogStreamConsumer(const LogStreamConsumer& other) = delete; LogStreamConsumer() = delete; - ~LogStreamConsumer() = default; + ~LogStreamConsumer() override = default; LogStreamConsumer& operator=(const LogStreamConsumer&) = delete; LogStreamConsumer& operator=(LogStreamConsumer&&) = delete; @@ -291,7 +292,7 @@ class Logger : public nvinfer1::ILogger }; //! - //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger + //! \brief Forward-compatible method for retrieving the nvinfer1::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, @@ -353,7 +354,7 @@ class Logger : public nvinfer1::ILogger //! //! \brief Define a test for logging //! - //! \param[in] name The name of the test. This should be a string starting with + //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! 
the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" @@ -379,7 +380,8 @@ class Logger : public nvinfer1::ILogger static TestAtom defineTest(const std::string& name, int32_t argc, char const* const* argv) { // Append TensorRT version as info - const std::string vname = name + " [TensorRT v" + std::to_string(NV_TENSORRT_VERSION) + "]"; + const std::string vname = name + " [TensorRT v" + std::to_string(NV_TENSORRT_VERSION) + "] [b" + + std::to_string(NV_TENSORRT_BUILD) + "]"; auto cmdline = genCmdlineString(argc, argv); return defineTest(vname, cmdline); } diff --git a/src/Detector/tensorrt_yolo/common/parserOnnxConfig.h b/src/Detector/tensorrt_yolo/common/parserOnnxConfig.h index c92a1420..67ee6c71 100644 --- a/src/Detector/tensorrt_yolo/common/parserOnnxConfig.h +++ b/src/Detector/tensorrt_yolo/common/parserOnnxConfig.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -35,15 +36,13 @@ * */ -using namespace std; - class ParserOnnxConfig : public nvonnxparser::IOnnxConfig { protected: - string mModelFilename{}; - string mTextFilename{}; - string mFullTextFilename{}; + std::string mModelFilename{}; + std::string mTextFilename{}; + std::string mFullTextFilename{}; nvinfer1::DataType mModelDtype; nvonnxparser::IOnnxConfig::Verbosity mVerbosity; bool mPrintLayercInfo; @@ -62,8 +61,7 @@ class ParserOnnxConfig : public nvonnxparser::IOnnxConfig #endif } -protected: - ~ParserOnnxConfig() + ~ParserOnnxConfig() override { #ifdef ONNX_DEBUG if (isDebug()) @@ -74,62 +72,62 @@ class ParserOnnxConfig : public nvonnxparser::IOnnxConfig } public: - virtual void setModelDtype(const nvinfer1::DataType modelDtype) noexcept + void setModelDtype(const nvinfer1::DataType modelDtype) noexcept override { mModelDtype = modelDtype; } - virtual nvinfer1::DataType getModelDtype() const noexcept + nvinfer1::DataType getModelDtype() const noexcept override { return mModelDtype; } - virtual const char* getModelFileName() const noexcept + const char* getModelFileName() const noexcept override { return mModelFilename.c_str(); } - virtual void setModelFileName(const char* onnxFilename) noexcept + void setModelFileName(const char* onnxFilename) noexcept override { - mModelFilename = string(onnxFilename); + mModelFilename = std::string(onnxFilename); } - virtual nvonnxparser::IOnnxConfig::Verbosity getVerbosityLevel() const noexcept + nvonnxparser::IOnnxConfig::Verbosity getVerbosityLevel() const noexcept override { return mVerbosity; } - virtual void addVerbosity() noexcept + void addVerbosity() noexcept override { ++mVerbosity; } - virtual void reduceVerbosity() noexcept + void reduceVerbosity() noexcept override { --mVerbosity; } - virtual void setVerbosityLevel(nvonnxparser::IOnnxConfig::Verbosity verbosity) noexcept + void setVerbosityLevel(nvonnxparser::IOnnxConfig::Verbosity verbosity) noexcept override { mVerbosity = verbosity; } - virtual const char* getTextFileName() const noexcept + const char* 
getTextFileName() const noexcept override { return mTextFilename.c_str(); } - virtual void setTextFileName(const char* textFilename) noexcept + void setTextFileName(const char* textFilename) noexcept override { - mTextFilename = string(textFilename); + mTextFilename = std::string(textFilename); } - virtual const char* getFullTextFileName() const noexcept + const char* getFullTextFileName() const noexcept override { return mFullTextFilename.c_str(); } - virtual void setFullTextFileName(const char* fullTextFilename) noexcept + void setFullTextFileName(const char* fullTextFilename) noexcept override { - mFullTextFilename = string(fullTextFilename); + mFullTextFilename = std::string(fullTextFilename); } - virtual bool getPrintLayerInfo() const noexcept + bool getPrintLayerInfo() const noexcept override { return mPrintLayercInfo; } - virtual void setPrintLayerInfo(bool src) noexcept + void setPrintLayerInfo(bool src) noexcept override { mPrintLayercInfo = src; } //!< get the boolean variable corresponding to the Layer Info, see getPrintLayerInfo() @@ -142,12 +140,6 @@ class ParserOnnxConfig : public nvonnxparser::IOnnxConfig return false; #endif } - - virtual void destroy() noexcept - { - delete this; - } - }; // class ParserOnnxConfig #endif diff --git a/src/Detector/tensorrt_yolo/common/safeCommon.h b/src/Detector/tensorrt_yolo/common/safeCommon.h index 3d84b095..f10aad18 100644 --- a/src/Detector/tensorrt_yolo/common/safeCommon.h +++ b/src/Detector/tensorrt_yolo/common/safeCommon.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -17,13 +18,32 @@ #ifndef TENSORRT_SAFE_COMMON_H #define TENSORRT_SAFE_COMMON_H -#include "NvInferRuntimeCommon.h" +#include "cuda_runtime.h" +#include "sampleEntrypoints.h" +#include #include +#include #include #include +#include #include #include +// For safeLoadLibrary +#ifdef _MSC_VER +// Needed so that the max/min definitions in windows.h do not conflict with std::max/min. +#define NOMINMAX +#include +#undef NOMINMAX +#else +#include +#endif +#if IS_QNX_SAFE +#include +#include +#endif // IS_QNX_SAFE + +#undef CHECK #define CHECK(status) \ do \ { \ @@ -31,10 +51,92 @@ if (ret != 0) \ { \ std::cerr << "Cuda failure: " << ret << std::endl; \ - abort(); \ + exit(EXIT_FAILURE); \ } \ } while (0) +#undef SAFE_ASSERT +#define SAFE_ASSERT(condition) \ + do \ + { \ + if (!(condition)) \ + { \ + std::cerr << "Assertion failure: " << #condition << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +//! Locate path to file, given its filename or filepath suffix and possible dirs it might lie in. +//! Function will also walk back MAX_DEPTH dirs from CWD to check for such a file path. 
+inline std::string locateFile(
+    const std::string& filepathSuffix, const std::vector<std::string>& directories, bool reportError = true)
+{
+    const int MAX_DEPTH{10};
+    bool found{false};
+    std::string filepath;
+
+    for (auto& dir : directories)
+    {
+        if (!dir.empty() && dir.back() != '/')
+        {
+#ifdef _MSC_VER
+            filepath = dir + "\\" + filepathSuffix;
+#else
+            filepath = dir + "/" + filepathSuffix;
+#endif
+        }
+        else
+        {
+            filepath = dir + filepathSuffix;
+        }
+
+        for (int i = 0; i < MAX_DEPTH && !found; i++)
+        {
+            const std::ifstream checkFile(filepath);
+            found = checkFile.is_open();
+            if (found)
+            {
+                break;
+            }
+
+            filepath = "../" + filepath; // Try again in parent dir
+        }
+
+        if (found)
+        {
+            break;
+        }
+
+        filepath.clear();
+    }
+
+    // Could not find the file
+    if (filepath.empty())
+    {
+        const std::string dirList = std::accumulate(directories.begin() + 1, directories.end(), directories.front(),
+            [](const std::string& a, const std::string& b) { return a + "\n\t" + b; });
+        std::cout << "Could not find " << filepathSuffix << " in data directories:\n\t" << dirList << std::endl;
+
+        if (reportError)
+        {
+            std::cout << "&&&& FAILED" << std::endl;
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    return filepath;
+}
+
+inline void readPGMFile(const std::string& fileName, uint8_t* buffer, int32_t inH, int32_t inW)
+{
+    std::ifstream infile(fileName, std::ifstream::binary);
+    SAFE_ASSERT(infile.is_open() && "Attempting to read from a file that is not open.");
+    std::string magic, w, h, max;
+    infile >> magic >> w >> h >> max;
+    infile.seekg(1, infile.cur);
+    infile.read(reinterpret_cast<char*>(buffer), inH * inW);
+}
+
 namespace samplesCommon
 {
 template <typename T>
@@ -51,11 +153,17 @@ inline uint32_t elementSize(nvinfer1::DataType t)
 {
     switch (t)
     {
+    case nvinfer1::DataType::kINT64: return 8;
     case nvinfer1::DataType::kINT32:
     case nvinfer1::DataType::kFLOAT: return 4;
-    case nvinfer1::DataType::kHALF: return 2;
-    case nvinfer1::DataType::kINT8: return 1;
-    case nvinfer1::DataType::kBOOL: return 1;
+    case nvinfer1::DataType::kHALF:
+    case nvinfer1::DataType::kBF16: return 2;
+    case nvinfer1::DataType::kINT8:
+    case nvinfer1::DataType::kUINT8:
+    case nvinfer1::DataType::kBOOL:
+    case nvinfer1::DataType::kFP8: return 1;
+    case nvinfer1::DataType::kINT4:
+        SAFE_ASSERT(false && "Element size is not implemented for sub-byte data-types");
     }
     return 0;
 }
@@ -66,6 +174,205 @@ inline A divUp(A x, B n)
 {
     return (x + n - 1) / n;
 }

+inline int64_t volume(nvinfer1::Dims const& d)
+{
+    return std::accumulate(d.d, d.d + d.nbDims, int64_t{1}, std::multiplies<int64_t>{});
+}
+
+//! Return m rounded up to nearest multiple of n
+template <typename T1, typename T2>
+inline T1 roundUp(T1 m, T2 n)
+{
+    static_assert(std::is_integral<T1>::value && std::is_integral<T2>::value, "arguments must be integers");
+    static_assert(std::is_signed<T1>::value == std::is_signed<T2>::value, "mixed signedness not allowed");
+    static_assert(sizeof(T1) >= sizeof(T2), "first type must be at least as wide as second type");
+    return ((m + n - 1) / n) * n;
+}
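// Illustration only, not part of the patch: volume() multiplies out the extents
// of a Dims, and roundUp() aligns an extent to a multiple (both in samplesCommon).
#include <cassert>

inline void exampleVolumeAndRoundUp()
{
    nvinfer1::Dims d{};
    d.nbDims = 3;
    d.d[0] = 1; d.d[1] = 3; d.d[2] = 640;
    assert(samplesCommon::volume(d) == 1 * 3 * 640);
    assert(samplesCommon::roundUp(3, 4) == 4); // e.g. padding channels to a vector width
}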
+//! comps is the number of components in a vector. Ignored if vecDim < 0.
+inline int64_t volume(nvinfer1::Dims dims, int32_t vecDim, int32_t comps, int32_t batch)
+{
+    if (vecDim >= 0)
+    {
+        dims.d[vecDim] = roundUp(dims.d[vecDim], comps);
+    }
+    return samplesCommon::volume(dims) * std::max(batch, 1);
+}
+
+inline int32_t getSMVersion()
+{
+#if 0
+    // Use default value for 4090
+    int32_t major{8};
+    int32_t minor{9};
+#else
+    int32_t major{};
+    int32_t minor{};
+    int32_t deviceIndex{};
+    CHECK(cudaGetDevice(&deviceIndex));
+    CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, deviceIndex));
+    CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, deviceIndex));
+#endif
+    return ((major << 8) | minor);
+}
+
+inline bool isSMSafe()
+{
+    const int32_t smVersion = getSMVersion();
+    return smVersion == 0x0700 || smVersion == 0x0705 || smVersion == 0x0800 || smVersion == 0x0806
+        || smVersion == 0x0807;
+}
+
+inline int32_t calculateSoftmax(float* const prob, int32_t const numDigits)
+{
+    SAFE_ASSERT(prob != nullptr);
+    SAFE_ASSERT(numDigits == 10);
+    float sum{0.0F};
+    std::transform(prob, prob + numDigits, prob, [&sum](float v) -> float {
+        sum += exp(v);
+        return exp(v);
+    });
+
+    SAFE_ASSERT(sum != 0.0F);
+    std::transform(prob, prob + numDigits, prob, [sum](float v) -> float { return v / sum; });
+    int32_t idx = std::max_element(prob, prob + numDigits) - prob;
+    return idx;
+}
+
+//!
+//! \class TrtCudaGraphSafe
+//! \brief Managed CUDA graph
+//!
+class TrtCudaGraphSafe
+{
+public:
+    explicit TrtCudaGraphSafe() = default;
+
+    TrtCudaGraphSafe(const TrtCudaGraphSafe&) = delete;
+
+    TrtCudaGraphSafe& operator=(const TrtCudaGraphSafe&) = delete;
+
+    TrtCudaGraphSafe(TrtCudaGraphSafe&&) = delete;
+
+    TrtCudaGraphSafe& operator=(TrtCudaGraphSafe&&) = delete;
+
+    ~TrtCudaGraphSafe()
+    {
+        if (mGraphExec)
+        {
+            cudaGraphExecDestroy(mGraphExec);
+        }
+    }
+
+    void beginCapture(cudaStream_t& stream)
+    {
+        // cudaStreamCaptureModeGlobal is the only allowed mode in SAFE CUDA
+        CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
+    }
+
+    bool launch(cudaStream_t& stream)
+    {
+        return cudaGraphLaunch(mGraphExec, stream) == cudaSuccess;
+    }
+
+    void endCapture(cudaStream_t& stream)
+    {
+        CHECK(cudaStreamEndCapture(stream, &mGraph));
+        CHECK(cudaGraphInstantiate(&mGraphExec, mGraph, nullptr, nullptr, 0));
+        CHECK(cudaGraphDestroy(mGraph));
+    }
+
+    void endCaptureOnError(cudaStream_t& stream)
+    {
+        // There are two possibilities why stream capture would fail:
+        // (1) stream is in cudaErrorStreamCaptureInvalidated state.
+        // (2) TRT reports a failure.
+        // In case (1), the returned mGraph should be nullptr.
+        // In case (2), the returned mGraph is not nullptr, but it should not be used.
+        const auto ret = cudaStreamEndCapture(stream, &mGraph);
+        if (ret == cudaErrorStreamCaptureInvalidated)
+        {
+            SAFE_ASSERT(mGraph == nullptr);
+        }
+        else
+        {
+            SAFE_ASSERT(ret == cudaSuccess);
+            SAFE_ASSERT(mGraph != nullptr);
+            CHECK(cudaGraphDestroy(mGraph));
+            mGraph = nullptr;
+        }
+        // Clean up any CUDA error.
+        cudaGetLastError();
+        sample::gLogError << "The CUDA graph capture on the stream has failed." << std::endl;
+    }
+
+private:
+    cudaGraph_t mGraph{};
+    cudaGraphExec_t mGraphExec{};
+};
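// Illustration only, not part of the patch: a typical capture/replay flow with
// TrtCudaGraphSafe. The enqueue step is a placeholder for whatever work (for
// example an IExecutionContext::enqueueV3 call) should be recorded in the graph.
inline void exampleGraphCapture(cudaStream_t stream)
{
    samplesCommon::TrtCudaGraphSafe graph;

    graph.beginCapture(stream);
    // ... enqueue work on `stream` here; if the enqueue itself fails, call
    // graph.endCaptureOnError(stream) instead of endCapture() ...
    graph.endCapture(stream);

    // Replaying the instantiated graph is cheaper than re-enqueueing each time.
    if (!graph.launch(stream))
    {
        sample::gLogError << "CUDA graph launch failed" << std::endl;
    }
    CHECK(cudaStreamSynchronize(stream));
}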
+
+inline void safeLoadLibrary(const std::string& path)
+{
+#ifdef _MSC_VER
+    void* handle = LoadLibraryA(path.c_str());
+#else
+    int32_t flags{RTLD_LAZY};
+    void* handle = dlopen(path.c_str(), flags);
+#endif
+    if (handle == nullptr)
+    {
+#ifdef _MSC_VER
+        sample::gLogError << "Could not load plugin library: " << path << std::endl;
+#else
+        sample::gLogError << "Could not load plugin library: " << path << ", due to: " << dlerror() << std::endl;
+#endif
+    }
+}
+
+inline std::vector<std::string> safeSplitString(std::string str, char delimiter = ',')
+{
+    std::vector<std::string> splitVect;
+    std::stringstream ss(str);
+    std::string substr;
+
+    while (ss.good())
+    {
+        getline(ss, substr, delimiter);
+        splitVect.emplace_back(std::move(substr));
+    }
+    return splitVect;
+}
+
 } // namespace samplesCommon

+namespace safetyCompliance
+{
+inline void initSafeCuda()
+{
+    // According to the CUDA initialization section in the NVIDIA CUDA SAFETY API
+    // REFERENCE FOR DRIVE OS, we need to do the following, in order:
+    // 1. Initialize the calling thread with CUDA-specific information (call any CUDA RT API identified as init)
+    // 2. Query/configure and choose the desired CUDA device
+    // 3. Initialize the CUDA context (call cudaDeviceGetLimit or cuCtxCreate)
+    size_t stackSizeLimit = 0;
+    int32_t deviceIndex = 0;
+    CHECK(cudaGetDevice(&deviceIndex));
+    CHECK(cudaDeviceGetLimit(&stackSizeLimit, cudaLimitStackSize));
+#if IS_QNX_SAFE
+    CHECK(cudaSafeExSelectAPIMode(cudaSafeExAPIModeAsilB));
+#endif // IS_QNX_SAFE
+}
+
+inline void setPromgrAbility()
+{
+#if IS_QNX_SAFE
+    // Comply with DEEPLRN_RES_117 on QNX-safe by dropping the PROCMGR_AID_MEM_PHYS
+    // ability and locking out any further changes
+    procmgr_ability(
+        0, PROCMGR_ADN_NONROOT | PROCMGR_AOP_DENY | PROCMGR_AOP_LOCK | PROCMGR_AID_MEM_PHYS, PROCMGR_AID_EOL);
+#endif // IS_QNX_SAFE
+}
+
+} // namespace safetyCompliance

 #endif // TENSORRT_SAFE_COMMON_H
diff --git a/src/Detector/tensorrt_yolo/common/sampleConfig.h b/src/Detector/tensorrt_yolo/common/sampleConfig.h
index 53a78331..801a268a 100644
--- a/src/Detector/tensorrt_yolo/common/sampleConfig.h
+++ b/src/Detector/tensorrt_yolo/common/sampleConfig.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -55,9 +56,9 @@ class SampleConfig : public nvonnxparser::IOnnxConfig bool mDebugBuilder{false}; InputDataFormat mInputDataFormat{InputDataFormat::kASCII}; uint64_t mTopK{0}; - float mFailurePercentage{-1.0f}; - float mTolerance{0.0f}; - float mAbsTolerance{1e-5f}; + float mFailurePercentage{-1.0F}; + float mTolerance{0.0F}; + float mAbsTolerance{1e-5F}; public: SampleConfig() @@ -70,8 +71,7 @@ class SampleConfig : public nvonnxparser::IOnnxConfig #endif } -protected: - ~SampleConfig() + ~SampleConfig() override { #ifdef ONNX_DEBUG if (isDebug()) @@ -82,12 +82,12 @@ class SampleConfig : public nvonnxparser::IOnnxConfig } public: - void setModelDtype(const nvinfer1::DataType mdt) noexcept + void setModelDtype(const nvinfer1::DataType mdt) noexcept override { mModelDtype = mdt; } - nvinfer1::DataType getModelDtype() const noexcept + nvinfer1::DataType getModelDtype() const noexcept override { return mModelDtype; } @@ -102,28 +102,28 @@ class SampleConfig : public nvonnxparser::IOnnxConfig mTF32 = enabled; } - const char* getModelFileName() const noexcept + const char* getModelFileName() const noexcept override { return mModelFilename.c_str(); } - void setModelFileName(const char* onnxFilename) noexcept + void setModelFileName(const char* onnxFilename) noexcept override { mModelFilename = std::string(onnxFilename); } - Verbosity getVerbosityLevel() const noexcept + Verbosity getVerbosityLevel() const noexcept override { return mVerbosity; } - void addVerbosity() noexcept + void addVerbosity() noexcept override { ++mVerbosity; } - void reduceVerbosity() noexcept + void reduceVerbosity() noexcept override { --mVerbosity; } - virtual void setVerbosityLevel(Verbosity v) noexcept + void setVerbosityLevel(Verbosity v) noexcept override { mVerbosity = v; } @@ -135,19 +135,19 @@ class SampleConfig : public nvonnxparser::IOnnxConfig { mEngineFilename = std::string(engineFilename); } - const char* getTextFileName() const noexcept + const char* getTextFileName() const noexcept override { return mTextFilename.c_str(); } - void setTextFileName(const char* textFilename) noexcept + void setTextFileName(const char* textFilename) noexcept override { mTextFilename = std::string(textFilename); } - const char* getFullTextFileName() const noexcept + const char* getFullTextFileName() const noexcept override { return mFullTextFilename.c_str(); } - void setFullTextFileName(const char* fullTextFilename) noexcept + void setFullTextFileName(const char* fullTextFilename) noexcept override { mFullTextFilename = std::string(fullTextFilename); } @@ -161,12 +161,12 @@ class SampleConfig : public nvonnxparser::IOnnxConfig return mLabel; } //!< get the Label - bool getPrintLayerInfo() const noexcept + bool getPrintLayerInfo() const noexcept override { return mPrintLayercInfo; } - void setPrintLayerInfo(bool b) noexcept + void setPrintLayerInfo(bool b) noexcept override { mPrintLayercInfo = b; } //!< get the boolean variable corresponding to the Layer Info, see getPrintLayerInfo() @@ -312,7 +312,7 @@ class SampleConfig : public nvonnxparser::IOnnxConfig { return mTimingCacheFilename.c_str(); } - + void setTimingCacheFileName(const char* timingCacheFilename) noexcept { mTimingCacheFilename = std::string(timingCacheFilename); @@ -326,12 +326,6 @@ class 
SampleConfig : public nvonnxparser::IOnnxConfig
         return false;
 #endif
     }
-
-    void destroy() noexcept
-    {
-        delete this;
-    }
-
 }; // class SampleConfig

 #endif
diff --git a/src/Detector/tensorrt_yolo/common/sampleDevice.cpp b/src/Detector/tensorrt_yolo/common/sampleDevice.cpp
new file mode 100644
index 00000000..7964aeb5
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common/sampleDevice.cpp
@@ -0,0 +1,133 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "sampleDevice.h"
+
+#include
+
+namespace sample
+{
+
+void cudaCheck(cudaError_t ret, std::ostream& err)
+{
+    if (ret != cudaSuccess)
+    {
+        err << "Cuda failure: " << cudaGetErrorString(ret) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+// Construct GPU UUID string in the same format as nvidia-smi does.
+std::string getUuidString(cudaUUID_t uuid)
+{
+    constexpr int32_t kUUID_SIZE = sizeof(cudaUUID_t);
+    static_assert(kUUID_SIZE == 16, "Unexpected size for cudaUUID_t!");
+
+    std::ostringstream ss;
+    std::vector<int32_t> const splits = {0, 4, 6, 8, 10, kUUID_SIZE};
+
+    ss << "GPU" << std::hex << std::setfill('0');
+    for (int32_t splitIdx = 0; splitIdx < static_cast<int32_t>(splits.size()) - 1; ++splitIdx)
+    {
+        ss << "-";
+        for (int32_t byteIdx = splits[splitIdx]; byteIdx < splits[splitIdx + 1]; ++byteIdx)
+        {
+            ss << std::setw(2) << +static_cast<uint8_t>(uuid.bytes[byteIdx]);
+        }
+    }
+    return ss.str();
+}
+
+void setCudaDevice(int32_t device, std::ostream& os)
+{
+#if !TRT_WINML
+    os << "=== Device Information ===" << std::endl;
+
+    // Get the number of visible GPUs.
+    int32_t nbDevices{-1};
+    cudaCheck(cudaGetDeviceCount(&nbDevices));
+
+    if (nbDevices <= 0)
+    {
+        os << "Cannot find any available devices (GPUs)!" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    // Print out the GPU name and PCIe bus ID of each GPU.
+    os << "Available Devices: " << std::endl;
+    cudaDeviceProp properties;
+    for (int32_t deviceIdx = 0; deviceIdx < nbDevices; ++deviceIdx)
+    {
+        cudaDeviceProp tempProperties;
+        cudaCheck(cudaGetDeviceProperties(&tempProperties, deviceIdx));
+
+        // clang-format off
+        os << "  Device " << deviceIdx << ": \"" << tempProperties.name << "\" UUID: "
+           << getUuidString(tempProperties.uuid) << std::endl;
+        // clang-format on
+
+        // Record the properties of the desired GPU.
+        if (deviceIdx == device)
+        {
+            properties = tempProperties;
+        }
+    }
+
+    // Exit with error if the requested device ID does not exist.
+    if (device < 0 || device >= nbDevices)
+    {
+        os << "Cannot find device ID " << device << "!" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    // Set to the corresponding GPU.
+    cudaCheck(cudaSetDevice(device));
+
+    // clang-format off
+    os << "Selected Device: " << properties.name << std::endl;
+    os << "Selected Device ID: " << device << std::endl;
+    os << "Selected Device UUID: " << getUuidString(properties.uuid) << std::endl;
+    os << "Compute Capability: " << properties.major << "."
<< properties.minor << std::endl; + os << "SMs: " << properties.multiProcessorCount << std::endl; + os << "Device Global Memory: " << (properties.totalGlobalMem >> 20) << " MiB" << std::endl; + os << "Shared Memory per SM: " << (properties.sharedMemPerMultiprocessor >> 10) << " KiB" << std::endl; + os << "Memory Bus Width: " << properties.memoryBusWidth << " bits" + << " (ECC " << (properties.ECCEnabled != 0 ? "enabled" : "disabled") << ")" << std::endl; + os << "Application Compute Clock Rate: " << properties.clockRate / 1000000.0F << " GHz" << std::endl; + os << "Application Memory Clock Rate: " << properties.memoryClockRate / 1000000.0F << " GHz" << std::endl; + os << std::endl; + os << "Note: The application clock rates do not reflect the actual clock rates that the GPU is " + << "currently running at." << std::endl; + // clang-format on +#endif +} + +int32_t getCudaDriverVersion() +{ + int32_t version{-1}; + cudaCheck(cudaDriverGetVersion(&version)); + return version; +} + +int32_t getCudaRuntimeVersion() +{ + int32_t version{-1}; + cudaCheck(cudaRuntimeGetVersion(&version)); + return version; +} + +} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleDevice.h b/src/Detector/tensorrt_yolo/common/sampleDevice.h index 2053ac7c..986dccb4 100644 --- a/src/Detector/tensorrt_yolo/common/sampleDevice.h +++ b/src/Detector/tensorrt_yolo/common/sampleDevice.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -23,17 +24,13 @@ #include #include +#include "sampleUtils.h" + namespace sample { -inline void cudaCheck(cudaError_t ret, std::ostream& err = std::cerr) -{ - if (ret != cudaSuccess) - { - err << "Cuda failure: " << cudaGetErrorString(ret) << std::endl; - abort(); - } -} +//! Check if the CUDA return status shows any error. If so, exit the program immediately. +void cudaCheck(cudaError_t ret, std::ostream& err = std::cerr); class TrtCudaEvent; @@ -238,16 +235,18 @@ class TrtCudaBuffer TrtCudaBuffer(TrtCudaBuffer&& rhs) { - reset(rhs.mPtr); + reset(rhs.mPtr, rhs.mSize); rhs.mPtr = nullptr; + rhs.mSize = 0; } TrtCudaBuffer& operator=(TrtCudaBuffer&& rhs) { if (this != &rhs) { - reset(rhs.mPtr); + reset(rhs.mPtr, rhs.mSize); rhs.mPtr = nullptr; + rhs.mSize = 0; } return *this; } @@ -260,21 +259,24 @@ class TrtCudaBuffer TrtCudaBuffer(size_t size) { A()(&mPtr, size); + mSize = size; } void allocate(size_t size) { reset(); A()(&mPtr, size); + mSize = size; } - void reset(void* ptr = nullptr) + void reset(void* ptr = nullptr, size_t size = 0) { if (mPtr) { D()(mPtr); } mPtr = ptr; + mSize = size; } void* get() const @@ -282,8 +284,14 @@ class TrtCudaBuffer return mPtr; } + size_t getSize() const + { + return mSize; + } + private: void* mPtr{nullptr}; + size_t mSize{0}; }; struct DeviceAllocator @@ -383,39 +391,39 @@ class IMirroredBuffer }; // class IMirroredBuffer //! -//! Class to have a seperate memory buffer for discrete device and host allocations. +//! 
Class to have a separate memory buffer for discrete device and host allocations. //! class DiscreteMirroredBuffer : public IMirroredBuffer { public: - void allocate(size_t size) + void allocate(size_t size) override { mSize = size; mHostBuffer.allocate(size); mDeviceBuffer.allocate(size); } - void* getDeviceBuffer() const + void* getDeviceBuffer() const override { return mDeviceBuffer.get(); } - void* getHostBuffer() const + void* getHostBuffer() const override { return mHostBuffer.get(); } - void hostToDevice(TrtCudaStream& stream) + void hostToDevice(TrtCudaStream& stream) override { cudaCheck(cudaMemcpyAsync(mDeviceBuffer.get(), mHostBuffer.get(), mSize, cudaMemcpyHostToDevice, stream.get())); } - void deviceToHost(TrtCudaStream& stream) + void deviceToHost(TrtCudaStream& stream) override { cudaCheck(cudaMemcpyAsync(mHostBuffer.get(), mDeviceBuffer.get(), mSize, cudaMemcpyDeviceToHost, stream.get())); } - size_t getSize() const + size_t getSize() const override { return mSize; } @@ -432,33 +440,33 @@ class DiscreteMirroredBuffer : public IMirroredBuffer class UnifiedMirroredBuffer : public IMirroredBuffer { public: - void allocate(size_t size) + void allocate(size_t size) override { mSize = size; mBuffer.allocate(size); } - void* getDeviceBuffer() const + void* getDeviceBuffer() const override { return mBuffer.get(); } - void* getHostBuffer() const + void* getHostBuffer() const override { return mBuffer.get(); } - void hostToDevice(TrtCudaStream& /*stream*/) + void hostToDevice(TrtCudaStream& stream) override { // Does nothing since we are using unified memory. } - void deviceToHost(TrtCudaStream& /*stream*/) + void deviceToHost(TrtCudaStream& stream) override { // Does nothing since we are using unified memory. } - size_t getSize() const + size_t getSize() const override { return mSize; } @@ -468,26 +476,70 @@ class UnifiedMirroredBuffer : public IMirroredBuffer TrtManagedBuffer mBuffer; }; // class UnifiedMirroredBuffer -inline void setCudaDevice(int device, std::ostream& os) +//! +//! Class to allocate memory for outputs with data-dependent shapes. The sizes of those are unknown so pre-allocation is +//! not possible. +//! +class OutputAllocator : public nvinfer1::IOutputAllocator { - cudaCheck(cudaSetDevice(device)); - - cudaDeviceProp properties; - cudaCheck(cudaGetDeviceProperties(&properties, device)); - -// clang-format off - os << "=== Device Information ===" << std::endl; - os << "Selected Device: " << properties.name << std::endl; - os << "Compute Capability: " << properties.major << "." << properties.minor << std::endl; - os << "SMs: " << properties.multiProcessorCount << std::endl; - os << "Compute Clock Rate: " << properties.clockRate / 1000000.0F << " GHz" << std::endl; - os << "Device Global Memory: " << (properties.totalGlobalMem >> 20) << " MiB" << std::endl; - os << "Shared Memory per SM: " << (properties.sharedMemPerMultiprocessor >> 10) << " KiB" << std::endl; - os << "Memory Bus Width: " << properties.memoryBusWidth << " bits" - << " (ECC " << (properties.ECCEnabled != 0 ? 
"enabled" : "disabled") << ")" << std::endl; - os << "Memory Clock Rate: " << properties.memoryClockRate / 1000000.0F << " GHz" << std::endl; - // clang-format on -} +public: + OutputAllocator(IMirroredBuffer* buffer) + : mBuffer(buffer) + { + } + + void* reallocateOutput( + char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment) noexcept override + { + // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr + // even for empty tensors, so allocate a dummy byte. + size = std::max(size, static_cast(1)); + if (size > mSize) + { + mBuffer->allocate(roundUp(size, alignment)); + mSize = size; + } + return mBuffer->getDeviceBuffer(); + } + + //! IMirroredBuffer does not implement Async allocation, hence this is just a wrap around + void* reallocateOutputAsync(char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment, + cudaStream_t /*stream*/) noexcept override + { + return reallocateOutput(tensorName, currentMemory, size, alignment); + } + + void notifyShape(char const* tensorName, nvinfer1::Dims const& dims) noexcept override + { + mFinalDims = dims; + } + + IMirroredBuffer* getBuffer() + { + return mBuffer.get(); + } + + nvinfer1::Dims getFinalDims() + { + return mFinalDims; + } + + ~OutputAllocator() override {} + +private: + std::unique_ptr mBuffer; + uint64_t mSize{}; + nvinfer1::Dims mFinalDims; +}; + +//! Set the GPU to run the inference on. +void setCudaDevice(int32_t device, std::ostream& os); + +//! Get the CUDA version of the current CUDA driver. +int32_t getCudaDriverVersion(); + +//! Get the CUDA version of the current CUDA runtime. +int32_t getCudaRuntimeVersion(); } // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleEngines.cpp_ b/src/Detector/tensorrt_yolo/common/sampleEngines.cpp_ new file mode 100644 index 00000000..8ada0526 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/sampleEngines.cpp_ @@ -0,0 +1,1688 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvOnnxParser.h" + +#include "ErrorRecorder.h" +#include "common.h" +#include "half.h" +#include "logger.h" +#include "sampleDevice.h" +#include "sampleEngines.h" +#include "sampleOptions.h" +#include "sampleUtils.h" + +using namespace nvinfer1; + +namespace sample +{ + +namespace +{ + +std::map readScalesFromCalibrationCache(std::string const& calibrationFile) +{ + std::map tensorScales; + std::ifstream cache{calibrationFile}; + if (!cache.is_open()) + { + sample::gLogError << "[TRT] Can not open provided calibration cache file" << std::endl; + return tensorScales; + } + std::string line; + while (std::getline(cache, line)) + { + auto colonPos = line.find_last_of(':'); + if (colonPos != std::string::npos) + { + // Scales should be stored in calibration cache as 32-bit floating numbers encoded as 32-bit integers + int32_t scalesAsInt = std::stoi(line.substr(colonPos + 2, 8), nullptr, 16); + auto const tensorName = line.substr(0, colonPos); + tensorScales[tensorName] = *reinterpret_cast(&scalesAsInt); + } + } + cache.close(); + return tensorScales; +} +} // namespace + +nvinfer1::ICudaEngine* LazilyDeserializedEngine::get() +{ + SMP_RETVAL_IF_FALSE( + !mIsSafe, "Safe mode is enabled, but trying to get standard engine!", nullptr, sample::gLogError); + + if (mEngine == nullptr) + { + SMP_RETVAL_IF_FALSE(getFileReader().isOpen() || !getBlob().empty(), "Engine is empty. Nothing to deserialize!", + nullptr, sample::gLogError); + + using time_point = std::chrono::time_point; + using duration = std::chrono::duration; + time_point const deserializeStartTime{std::chrono::high_resolution_clock::now()}; + + if (mLeanDLLPath.empty()) + { + mRuntime.reset(createRuntime()); + } + else + { + mParentRuntime.reset(createRuntime()); + ASSERT(mParentRuntime.get() != nullptr); + + mRuntime.reset(mParentRuntime->loadRuntime(mLeanDLLPath.c_str())); + } + ASSERT(mRuntime.get() != nullptr); + + if (mVersionCompatible) + { + // Application needs to opt into allowing deserialization of engines with embedded lean runtime. + mRuntime->setEngineHostCodeAllowed(true); + } + + if (!mTempdir.empty()) + { + mRuntime->setTemporaryDirectory(mTempdir.c_str()); + } + + mRuntime->setTempfileControlFlags(mTempfileControls); + + SMP_RETVAL_IF_FALSE(mRuntime != nullptr, "runtime creation failed", nullptr, sample::gLogError); + if (mDLACore != -1) + { + mRuntime->setDLACore(mDLACore); + } + mRuntime->setErrorRecorder(&gRecorder); +#if !TRT_WINML + for (auto const& pluginPath : mDynamicPlugins) + { + mRuntime->getPluginRegistry().loadLibrary(pluginPath.c_str()); + } +#endif + + if (getFileReader().isOpen()) + { + mEngine.reset(mRuntime->deserializeCudaEngine(getFileReader())); + } + else + { + auto const& engineBlob = getBlob(); + mEngine.reset(mRuntime->deserializeCudaEngine(engineBlob.data, engineBlob.size)); + } + SMP_RETVAL_IF_FALSE(mEngine != nullptr, "Engine deserialization failed", nullptr, sample::gLogError); + + time_point const deserializeEndTime{std::chrono::high_resolution_clock::now()}; + sample::gLogInfo << "Engine deserialized in " << duration(deserializeEndTime - deserializeStartTime).count() + << " sec." 
<< std::endl; + } + + return mEngine.get(); +} + +nvinfer1::ICudaEngine* LazilyDeserializedEngine::release() +{ + return mEngine.release(); +} + +void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, std::vector const& inputFormats, + std::vector const& outputFormats, std::string const& calibrationFile) +{ + auto const tensorScales = readScalesFromCalibrationCache(calibrationFile); + bool const broadcastInputFormats = broadcastIOFormats(inputFormats, network.getNbInputs()); + for (int32_t i = 0, n = network.getNbInputs(); i < n; ++i) + { + int32_t formatIdx = broadcastInputFormats ? 0 : i; + if (!inputFormats.empty() && inputFormats[formatIdx].first == DataType::kINT8) + { + auto* input = network.getInput(i); + auto const calibScale = tensorScales.at(input->getName()); + input->setDynamicRange(-127 * calibScale, 127 * calibScale); + } + } + bool const broadcastOutputFormats = broadcastIOFormats(outputFormats, network.getNbOutputs()); + for (int32_t i = 0, n = network.getNbOutputs(); i < n; ++i) + { + int32_t formatIdx = broadcastOutputFormats ? 0 : i; + if (!outputFormats.empty() && outputFormats[formatIdx].first == DataType::kINT8) + { + auto* output = network.getOutput(i); + auto const calibScale = tensorScales.at(output->getName()); + output->setDynamicRange(-127 * calibScale, 127 * calibScale); + } + } +} + +//! +//! \brief Generate a network definition for a given model +//! +//! \param[in] model Model options for this network +//! \param[in,out] network Network storing the parsed results +//! \param[in,out] err Error stream +//! \param[out] vcPluginLibrariesUsed If not nullptr, will be populated with paths to VC plugin libraries required by +//! the parsed network. +//! +//! \return Parser The parser used to initialize the network and that holds the weights for the network, or an invalid +//! parser (the returned parser converts to false if tested) +//! +//! Constant input dimensions in the model must not be changed in the corresponding +//! network definition, because its correctness may rely on the constants. +//! +//! \see Parser::operator bool() +//! +Parser modelToNetwork(ModelOptions const& model, BuildOptions const& build, nvinfer1::INetworkDefinition& network, + std::ostream& err, std::vector* vcPluginLibrariesUsed) +{ + sample::gLogInfo << "Start parsing network model." << std::endl; + auto const tBegin = std::chrono::high_resolution_clock::now(); + + Parser parser; + switch (model.baseModel.format) + { + case ModelFormat::kONNX: + { + using namespace nvonnxparser; + parser.onnxParser.reset(createONNXParser(network)); + ASSERT(parser.onnxParser != nullptr); +#if !TRT_WINML + // kNATIVE_INSTANCENORM is ON by default in the parser and must be cleared to use the plugin implementation. 
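+        // For reference, a minimal standalone version of the parse step wrapped here
+        // would look like the sketch below (public ONNX parser API; model path
+        // hypothetical, error handling elided):
+        //
+        //   auto parser = std::unique_ptr<nvonnxparser::IParser>(
+        //       nvonnxparser::createParser(network, sample::gLogger.getTRTLogger()));
+        //   if (!parser->parseFromFile("model.onnx",
+        //           static_cast<int32_t>(nvinfer1::ILogger::Severity::kWARNING)))
+        //   {
+        //       // inspect parser->getNbErrors() / parser->getError(i)
+        //   }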
+ if (build.pluginInstanceNorm) + { + parser.onnxParser->clearFlag(OnnxParserFlag::kNATIVE_INSTANCENORM); + } +#endif + if (!parser.onnxParser->parseFromFile( + model.baseModel.model.c_str(), static_cast(sample::gLogger.getReportableSeverity()))) + { + err << "Failed to parse onnx file" << std::endl; + parser.onnxParser.reset(); + } +#if !TRT_WINML + if (vcPluginLibrariesUsed && parser.onnxParser.get()) + { + int64_t nbPluginLibs; + char const* const* pluginLibArray = parser.onnxParser->getUsedVCPluginLibraries(nbPluginLibs); + if (nbPluginLibs >= 0) + { + vcPluginLibrariesUsed->reserve(nbPluginLibs); + for (int64_t i = 0; i < nbPluginLibs; ++i) + { + sample::gLogInfo << "Using VC plugin library " << pluginLibArray[i] << std::endl; + vcPluginLibrariesUsed->emplace_back(std::string{pluginLibArray[i]}); + } + } + else + { + sample::gLogWarning << "Failure to query VC plugin libraries required by parsed ONNX network" + << std::endl; + } + } +#endif + break; + } + case ModelFormat::kANY: break; + } + + auto const tEnd = std::chrono::high_resolution_clock::now(); + float const parseTime = std::chrono::duration(tEnd - tBegin).count(); + + sample::gLogInfo << "Finished parsing network model. Parse time: " << parseTime << std::endl; + return parser; +} + +namespace +{ + +class RndInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 +{ +public: + RndInt8Calibrator(int32_t batches, std::vector& elemCount, std::string const& cacheFile, + nvinfer1::INetworkDefinition const& network, std::ostream& err); + + ~RndInt8Calibrator() override + { + for (auto& elem : mInputDeviceBuffers) + { + cudaCheck(cudaFree(elem.second), mErr); + } + } + + bool getBatch(void* bindings[], char const* names[], int32_t nbBindings) noexcept override; + + int32_t getBatchSize() const noexcept override + { + return 1; + } + + const void* readCalibrationCache(size_t& length) noexcept override; + + void writeCalibrationCache(void const*, size_t) noexcept override {} + +private: + int32_t mBatches{}; + int32_t mCurrentBatch{}; + std::string mCacheFile; + std::map mInputDeviceBuffers; + std::vector mCalibrationCache; + std::ostream& mErr; +}; + +RndInt8Calibrator::RndInt8Calibrator(int32_t batches, std::vector& elemCount, std::string const& cacheFile, + INetworkDefinition const& network, std::ostream& err) + : mBatches(batches) + , mCurrentBatch(0) + , mCacheFile(cacheFile) + , mErr(err) +{ + std::ifstream tryCache(cacheFile, std::ios::binary); + if (tryCache.good()) + { + return; + } + + std::default_random_engine generator; + std::uniform_real_distribution distribution(-1.0F, 1.0F); + auto gen = [&generator, &distribution]() { return distribution(generator); }; + + for (int32_t i = 0; i < network.getNbInputs(); i++) + { + auto* input = network.getInput(i); + std::vector rnd_data(elemCount[i]); + std::generate_n(rnd_data.begin(), elemCount[i], gen); + + void* data; + cudaCheck(cudaMalloc(&data, elemCount[i] * sizeof(float)), mErr); + cudaCheck(cudaMemcpy(data, rnd_data.data(), elemCount[i] * sizeof(float), cudaMemcpyHostToDevice), mErr); + + mInputDeviceBuffers.insert(std::make_pair(input->getName(), data)); + } +} + +bool RndInt8Calibrator::getBatch(void* bindings[], char const* names[], int32_t nbBindings) noexcept +{ + if (mCurrentBatch >= mBatches) + { + return false; + } + + for (int32_t i = 0; i < nbBindings; ++i) + { + bindings[i] = mInputDeviceBuffers[names[i]]; + } + + ++mCurrentBatch; + + return true; +} + +const void* RndInt8Calibrator::readCalibrationCache(size_t& length) noexcept +{ + mCalibrationCache.clear(); 
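+    // The file read below is a standard TensorRT calibration cache: a text file with a
+    // version tag on the first line, followed by "tensorName: <8 hex digits>" entries
+    // whose hex digits are the IEEE-754 bit pattern of the per-tensor scale (the same
+    // format parsed by readScalesFromCalibrationCache() above). Illustrative contents
+    // with hypothetical tensor names:
+    //
+    //   TRT-8600-EntropyCalibration2
+    //   input: 3c010a14
+    //   conv1_output: 3d8f5c29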
+ std::ifstream input(mCacheFile, std::ios::binary); + input >> std::noskipws; + if (input.good()) + { + std::copy( + std::istream_iterator(input), std::istream_iterator(), std::back_inserter(mCalibrationCache)); + } + + length = mCalibrationCache.size(); + return !mCalibrationCache.empty() ? mCalibrationCache.data() : nullptr; +} + +bool setTensorDynamicRange(INetworkDefinition const& network, float inRange = 2.0F, float outRange = 4.0F) +{ + // Ensure that all layer inputs have a dynamic range. + for (int32_t l = 0; l < network.getNbLayers(); l++) + { + auto* layer = network.getLayer(l); + for (int32_t i = 0; i < layer->getNbInputs(); i++) + { + ITensor* input{layer->getInput(i)}; + // Optional inputs are nullptr here and are from RNN layers. + if (input && !input->dynamicRangeIsSet()) + { + // Concat should propagate dynamic range from outputs to inputs to avoid + // Re-quantization during the concatenation + auto dynRange = (layer->getType() == LayerType::kCONCATENATION) ? outRange : inRange; + if (!input->setDynamicRange(-dynRange, dynRange)) + { + return false; + } + } + } + for (int32_t o = 0; o < layer->getNbOutputs(); o++) + { + ITensor* output{layer->getOutput(o)}; + // Optional outputs are nullptr here and are from RNN layers. + if (output && !output->dynamicRangeIsSet()) + { + // Pooling must have the same input and output dynamic range. + if (layer->getType() == LayerType::kPOOLING) + { + if (!output->setDynamicRange(-inRange, inRange)) + { + return false; + } + } + else + { + if (!output->setDynamicRange(-outRange, outRange)) + { + return false; + } + } + } + } + } + return true; +} + +bool isNonActivationType(nvinfer1::DataType const type) +{ + return type == nvinfer1::DataType::kINT32 || type == nvinfer1::DataType::kINT64 || type == nvinfer1::DataType::kBOOL + || type == nvinfer1::DataType::kUINT8; +} + +void setLayerPrecisions(INetworkDefinition& network, LayerPrecisions const& layerPrecisions) +{ + bool hasLayerPrecisionSkipped{false}; + for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) + { + auto* layer = network.getLayer(layerIdx); + auto const layerName = layer->getName(); + auto exactMatch = layerPrecisions.find(layerName); + auto plausibleMatch = findPlausible(layerPrecisions, layerName); + if (exactMatch != layerPrecisions.end()) + { + sample::gLogInfo << "Set layer " << layerName << " to precision " << exactMatch->second << std::endl; + layer->setPrecision(exactMatch->second); + } + else if (plausibleMatch != layerPrecisions.end()) + { + if (isNonActivationType(layer->getPrecision())) + { + hasLayerPrecisionSkipped = true; + sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because the " + << " default layer precision is of non-activation type." << std::endl; + continue; + } + if (layer->getType() == nvinfer1::LayerType::kCONSTANT + && (isNonActivationType(static_cast(layer)->getWeights().type))) + { + hasLayerPrecisionSkipped = true; + sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because this " + << "constant layer has weights of non-activation type." << std::endl; + continue; + } + if (layer->getNbInputs() >= 1 && layer->getInput(0)->isShapeTensor()) + { + hasLayerPrecisionSkipped = true; + sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because this layer " + << "operates on a shape tensor." 
<< std::endl; + continue; + } + if (layer->getNbInputs() >= 1 && isNonActivationType(layer->getInput(0)->getType()) + && layer->getNbOutputs() >= 1 && isNonActivationType(layer->getOutput(0)->getType())) + { + hasLayerPrecisionSkipped = true; + sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because this " + << "layer has input and output of non-activation type." << std::endl; + continue; + } + // All heuristics passed. Set the layer precision. + sample::gLogInfo << "Set layer " << layerName << " to precision " << plausibleMatch->second << std::endl; + layer->setPrecision(plausibleMatch->second); + } + } + + if (hasLayerPrecisionSkipped) + { + sample::gLogInfo << "Skipped setting precisions for some layers. Check verbose logs for more details." + << std::endl; + } +} + +void setLayerOutputTypes(INetworkDefinition& network, LayerOutputTypes const& layerOutputTypes) +{ + bool const hasGlobalOutputType{layerOutputTypes.find("*") != layerOutputTypes.end()}; + auto const globalOutputType = hasGlobalOutputType ? layerOutputTypes.at("*").at(0) : nvinfer1::DataType::kFLOAT; + bool hasLayerOutputTypeSkipped{false}; + for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) + { + auto* layer = network.getLayer(layerIdx); + auto const layerName = layer->getName(); + auto const nbOutputs = layer->getNbOutputs(); + auto exactMatch = layerOutputTypes.find(layerName); + auto plausibleMatch = findPlausible(layerOutputTypes, layerName); + if (exactMatch != layerOutputTypes.end()) + { + auto const& outputTypes = exactMatch->second; + bool const isBroadcast = (outputTypes.size() == 1); + if (!isBroadcast && static_cast(outputTypes.size()) != nbOutputs) + { + sample::gLogError << "Layer " << layerName << " has " << nbOutputs << " outputs but " + << outputTypes.size() << " output types are given in --layerOutputTypes flag." + << std::endl; + throw std::invalid_argument("Invalid --layerOutputTypes flag."); + } + for (int32_t outputIdx = 0; outputIdx < nbOutputs; ++outputIdx) + { + auto const outputType = outputTypes.at(isBroadcast ? 0 : outputIdx); + sample::gLogInfo << "Set output " << outputIdx << " of layer " << layerName << " to type " << outputType + << std::endl; + layer->setOutputType(outputIdx, outputType); + } + } + else if (plausibleMatch != layerOutputTypes.end()) + { + auto const& outputTypes = plausibleMatch->second; + bool const isBroadcast = (outputTypes.size() == 1); + + // We should not set the layer output types if its default precision is INT32 or Bool. + if (layer->getPrecision() == nvinfer1::DataType::kINT32 + || layer->getPrecision() == nvinfer1::DataType::kBOOL) + { + hasLayerOutputTypeSkipped = true; + sample::gLogVerbose << "Skipped setting output types for layer " << layerName << " because the " + << " default layer precision is INT32 or Bool." << std::endl; + continue; + } + // We should not set the constant layer output types if its weights are in INT32. + if (layer->getType() == nvinfer1::LayerType::kCONSTANT + && static_cast(layer)->getWeights().type == nvinfer1::DataType::kINT32) + { + hasLayerOutputTypeSkipped = true; + sample::gLogVerbose << "Skipped setting output types for layer " << layerName << " because this " + << "constant layer has INT32 weights." << std::endl; + continue; + } + for (int32_t outputIdx = 0; outputIdx < nbOutputs; ++outputIdx) + { + // We should not set the output type if the output is a shape tensor. 
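+                // (A shape tensor carries dimension values consumed by shape
+                // calculations, e.g. the reshape-dimensions input of IShuffleLayer;
+                // its element type is dictated by the network semantics, so forcing a
+                // user-requested output type onto it would make the network invalid.)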
+ if (layer->getOutput(0)->isShapeTensor()) + { + hasLayerOutputTypeSkipped = true; + sample::gLogVerbose << "Skipped setting output type for output " << outputIdx << " of layer " + << layerName << " because it is a shape tensor." << std::endl; + continue; + } + + auto const outputType = outputTypes.at(isBroadcast ? 0 : outputIdx); + sample::gLogInfo << "Set output " << outputIdx << " of layer " << layerName << " to type " << outputType + << std::endl; + layer->setOutputType(outputIdx, globalOutputType); + } + } + } + + if (hasLayerOutputTypeSkipped) + { + sample::gLogInfo << "Skipped setting output types for some layers. Check verbose logs for more details." + << std::endl; + } +} + +void setLayerDeviceTypes( + INetworkDefinition const& network, IBuilderConfig& config, LayerDeviceTypes const& layerDeviceTypes) +{ + for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) + { + auto* layer = network.getLayer(layerIdx); + auto const layerName = layer->getName(); + auto match = findPlausible(layerDeviceTypes, layerName); + if (match != layerDeviceTypes.end()) + { + DeviceType const deviceType = match->second; + sample::gLogInfo << "Set layer " << layerName << " to device type " << deviceType << std::endl; + config.setDeviceType(layer, deviceType); + } + } +} + +void markDebugTensors(INetworkDefinition& network, StringSet const& debugTensors) +{ + for (int64_t inputIndex = 0; inputIndex < network.getNbInputs(); ++inputIndex) + { + auto* t = network.getInput(inputIndex); + auto const tensorName = t->getName(); + if (debugTensors.count(tensorName) > 0) + { + network.markDebug(*t); + } + } + for (int64_t layerIndex = 0; layerIndex < network.getNbLayers(); ++layerIndex) + { + auto* layer = network.getLayer(layerIndex); + for (int64_t outputIndex = 0; outputIndex < layer->getNbOutputs(); ++outputIndex) + { + auto* t = layer->getOutput(outputIndex); + auto const tensorName = t->getName(); + if (debugTensors.count(tensorName) > 0) + { + network.markDebug(*t); + } + } + } +} + +void setMemoryPoolLimits(IBuilderConfig& config, BuildOptions const& build) +{ + auto const roundToBytes = [](double const size, bool fromMB = true) { + return static_cast(size * (fromMB ? 1.0_MiB : 1.0_KiB)); + }; + if (build.workspace >= 0) + { + config.setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, roundToBytes(build.workspace)); + } + if (build.dlaSRAM >= 0) + { + size_t const sizeInBytes = roundToBytes(build.dlaSRAM); + size_t sizeInPowerOf2{1}; + // Using 2^30 bytes as a loose upper bound to prevent the possibility of overflows and infinite loops. + while (sizeInPowerOf2 < 31 && (static_cast(1) << sizeInPowerOf2) <= sizeInBytes) + { + ++sizeInPowerOf2; + } + --sizeInPowerOf2; + if (sizeInPowerOf2 == 30) + { + sample::gLogWarning + << "User-specified DLA managed SRAM size is too large and has been clipped to 2^30 bytes. " + << "Please make sure that this is the intended managed SRAM size." 
<< std::endl; + } + config.setMemoryPoolLimit(MemoryPoolType::kDLA_MANAGED_SRAM, static_cast(1) << sizeInPowerOf2); + } + if (build.dlaLocalDRAM >= 0) + { + config.setMemoryPoolLimit(MemoryPoolType::kDLA_LOCAL_DRAM, roundToBytes(build.dlaLocalDRAM)); + } + if (build.dlaGlobalDRAM >= 0) + { + config.setMemoryPoolLimit(MemoryPoolType::kDLA_GLOBAL_DRAM, roundToBytes(build.dlaGlobalDRAM)); + } + if (build.tacticSharedMem >= 0) + { + config.setMemoryPoolLimit(MemoryPoolType::kTACTIC_SHARED_MEMORY, roundToBytes(build.tacticSharedMem, false)); + } +} + +void setPreviewFeatures(IBuilderConfig& config, BuildOptions const& build) +{ + auto const setFlag = [&](PreviewFeature feat) { + int32_t featVal = static_cast(feat); + if (build.previewFeatures.find(featVal) != build.previewFeatures.end()) + { + config.setPreviewFeature(feat, build.previewFeatures.at(featVal)); + } + }; + setFlag(PreviewFeature::kALIASED_PLUGIN_IO_10_03); +} + +} // namespace + +bool setupNetworkAndConfig(BuildOptions const& build, SystemOptions const& sys, IBuilder& builder, + INetworkDefinition& network, IBuilderConfig& config, std::unique_ptr& calibrator, + std::ostream& err, std::vector>& sparseWeights) +{ + std::vector profiles{}; + profiles.resize(build.optProfiles.size()); + for (auto& profile : profiles) + { + profile = builder.createOptimizationProfile(); + } + + bool hasDynamicShapes{false}; + + bool broadcastInputFormats = broadcastIOFormats(build.inputFormats, network.getNbInputs()); + + // Check if the provided input tensor names match the input tensors of the engine. + // Throw an error if the provided input tensor names cannot be found because it implies a potential typo. + for (auto const& shapes : build.optProfiles) + { + for (auto const& shape : shapes) + { + bool tensorNameFound{false}; + for (int32_t i = 0; i < network.getNbInputs(); ++i) + { + if (matchStringWithOneWildcard(shape.first, network.getInput(i)->getName())) + { + tensorNameFound = true; + break; + } + } + if (!tensorNameFound) + { + sample::gLogError << "Cannot find input tensor with name \"" << shape.first << "\" in the network " + << "inputs! Please make sure the input tensor names are correct." << std::endl; + return false; + } + } + } + + for (uint32_t i = 0, n = network.getNbInputs(); i < n; i++) + { + // Set formats and data types of inputs + auto* input = network.getInput(i); + if (!build.inputFormats.empty()) + { + int32_t inputFormatIndex = broadcastInputFormats ? 0 : i; + input->setType(build.inputFormats[inputFormatIndex].first); + input->setAllowedFormats(build.inputFormats[inputFormatIndex].second); + } + + auto const dims = input->getDimensions(); + auto const isScalar = dims.nbDims == 0; + auto const isDynamicInput = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }) + || input->isShapeTensor(); + if (isDynamicInput) + { + hasDynamicShapes = true; + for (size_t i = 0; i < build.optProfiles.size(); i++) + { + auto const& optShapes = build.optProfiles[i]; + auto profile = profiles[i]; + auto const tensorName = input->getName(); + auto shape = findPlausible(optShapes, tensorName); + ShapeRange shapes{}; + + // If no shape is provided, set dynamic dimensions to 1. 
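+                    // Illustrative example (hypothetical tensor): an execution tensor
+                    // with dims [-1, 3, -1, -1] and no --minShapes/--optShapes/--maxShapes
+                    // entry is pinned to [1, 3, 1, 1] for all of MIN/OPT/MAX below,
+                    // hence the warning recommending explicit shapes.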
+ if (shape == optShapes.end()) + { + constexpr int32_t kDEFAULT_DIMENSION{1}; + std::vector staticDims; + if (input->isShapeTensor()) + { + if (isScalar) + { + staticDims.push_back(1); + } + else + { + staticDims.resize(dims.d[0]); + std::fill(staticDims.begin(), staticDims.end(), kDEFAULT_DIMENSION); + } + } + else + { + staticDims.resize(dims.nbDims); + std::transform(dims.d, dims.d + dims.nbDims, staticDims.begin(), + [&](int dimension) { return dimension > 0 ? dimension : kDEFAULT_DIMENSION; }); + } + sample::gLogWarning << "Dynamic dimensions required for input: " << tensorName + << ", but no shapes were provided. Automatically overriding shape to: " + << staticDims << std::endl; + std::fill(shapes.begin(), shapes.end(), staticDims); + } + else + { + shapes = shape->second; + } + + std::vector profileDims{}; + if (input->isShapeTensor()) + { + profileDims = shapes[static_cast(OptProfileSelector::kMIN)]; + SMP_RETVAL_IF_FALSE(profile->setShapeValues(tensorName, OptProfileSelector::kMIN, + profileDims.data(), static_cast(profileDims.size())), + "Error in set shape values MIN", false, err); + profileDims = shapes[static_cast(OptProfileSelector::kOPT)]; + SMP_RETVAL_IF_FALSE(profile->setShapeValues(tensorName, OptProfileSelector::kOPT, + profileDims.data(), static_cast(profileDims.size())), + "Error in set shape values OPT", false, err); + profileDims = shapes[static_cast(OptProfileSelector::kMAX)]; + SMP_RETVAL_IF_FALSE(profile->setShapeValues(tensorName, OptProfileSelector::kMAX, + profileDims.data(), static_cast(profileDims.size())), + "Error in set shape values MAX", false, err); + sample::gLogInfo << "Set input shape tensor " << tensorName << " for optimization profile " << i + << " to:" + << " MIN=" << shapes[static_cast(OptProfileSelector::kMIN)] + << " OPT=" << shapes[static_cast(OptProfileSelector::kOPT)] + << " MAX=" << shapes[static_cast(OptProfileSelector::kMAX)] << std::endl; + } + else + { + profileDims = shapes[static_cast(OptProfileSelector::kMIN)]; + SMP_RETVAL_IF_FALSE( + profile->setDimensions(tensorName, OptProfileSelector::kMIN, toDims(profileDims)), + "Error in set dimensions to profile MIN", false, err); + profileDims = shapes[static_cast(OptProfileSelector::kOPT)]; + SMP_RETVAL_IF_FALSE( + profile->setDimensions(tensorName, OptProfileSelector::kOPT, toDims(profileDims)), + "Error in set dimensions to profile OPT", false, err); + profileDims = shapes[static_cast(OptProfileSelector::kMAX)]; + SMP_RETVAL_IF_FALSE( + profile->setDimensions(tensorName, OptProfileSelector::kMAX, toDims(profileDims)), + "Error in set dimensions to profile MAX", false, err); + sample::gLogInfo << "Set shape of input tensor " << tensorName << " for optimization profile " << i + << " to:" + << " MIN=" << shapes[static_cast(OptProfileSelector::kMIN)] + << " OPT=" << shapes[static_cast(OptProfileSelector::kOPT)] + << " MAX=" << shapes[static_cast(OptProfileSelector::kMAX)] << std::endl; + } + } + } + } + + for (uint32_t i = 0, n = network.getNbOutputs(); i < n; i++) + { + auto* output = network.getOutput(i); + auto const dims = output->getDimensions(); + // A shape tensor output with known static dimensions may have dynamic shape values inside it. 
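+        // (Example: the output of IShapeLayer has static dims equal to the rank of its
+        // input, yet its values track the input's actual shape at runtime, so it must
+        // still be treated as dynamic here.)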
+ auto const isDynamicOutput = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }) + || output->isShapeTensor(); + if (isDynamicOutput) + { + hasDynamicShapes = true; + } + } + + if (!hasDynamicShapes && !build.optProfiles[0].empty()) + { + sample::gLogError << "Static model does not take explicit shapes since the shape of inference tensors will be " + "determined by the model itself" + << std::endl; + return false; + } + + if (hasDynamicShapes) + { + for (auto profile : profiles) + { + SMP_RETVAL_IF_FALSE(profile->isValid(), "Required optimization profile is invalid", false, err); + SMP_RETVAL_IF_FALSE( + config.addOptimizationProfile(profile) != -1, "Error in add optimization profile", false, err); + } + } + + bool broadcastOutputFormats = broadcastIOFormats(build.outputFormats, network.getNbOutputs(), false); + + for (uint32_t i = 0, n = network.getNbOutputs(); i < n; i++) + { + // Set formats and data types of outputs + auto* output = network.getOutput(i); + if (!build.outputFormats.empty()) + { + int32_t outputFormatIndex = broadcastOutputFormats ? 0 : i; + output->setType(build.outputFormats[outputFormatIndex].first); + output->setAllowedFormats(build.outputFormats[outputFormatIndex].second); + } + } + + setMemoryPoolLimits(config, build); + + setPreviewFeatures(config, build); + + if (build.builderOptimizationLevel != defaultBuilderOptimizationLevel) + { + config.setBuilderOptimizationLevel(build.builderOptimizationLevel); + } + + if (build.maxTactics != defaultMaxTactics) + { + config.setMaxNbTactics(build.maxTactics); + } + + if (build.timingCacheMode == TimingCacheMode::kDISABLE) + { + config.setFlag(BuilderFlag::kDISABLE_TIMING_CACHE); + } + + if (build.disableCompilationCache) + { + config.setFlag(BuilderFlag::kDISABLE_COMPILATION_CACHE); + } + + if (build.errorOnTimingCacheMiss) + { + config.setFlag(BuilderFlag::kERROR_ON_TIMING_CACHE_MISS); + } + + if (!build.tf32) + { + config.clearFlag(BuilderFlag::kTF32); + } + + if (build.refittable) + { + config.setFlag(BuilderFlag::kREFIT); + } + + if (build.stripWeights) + { + // The kREFIT_IDENTICAL is enabled by default when kSTRIP_PLAN is on. 
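+        // A stripped plan is meant to be refitted with the original weights at load
+        // time; a minimal sketch using the ONNX parser refitter (model path
+        // hypothetical, error handling elided):
+        //
+        //   std::unique_ptr<IRefitter> refitter{createInferRefitter(*engine, logger)};
+        //   std::unique_ptr<nvonnxparser::IParserRefitter> parserRefitter{
+        //       nvonnxparser::createParserRefitter(*refitter, logger)};
+        //   parserRefitter->refitFromFile("model.onnx");
+        //   refitter->refitCudaEngine();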
+ config.setFlag(BuilderFlag::kSTRIP_PLAN); + } + + if (build.versionCompatible) + { + config.setFlag(BuilderFlag::kVERSION_COMPATIBLE); + } +#if !TRT_WINML + std::vector pluginPaths; + for (auto const& pluginPath : sys.setPluginsToSerialize) + { + sample::gLogVerbose << "Setting plugin to serialize: " << pluginPath << std::endl; + pluginPaths.push_back(pluginPath.c_str()); + } + if (!pluginPaths.empty()) + { + config.setPluginsToSerialize(pluginPaths.data(), pluginPaths.size()); + } +#endif + if (build.excludeLeanRuntime) + { + config.setFlag(BuilderFlag::kEXCLUDE_LEAN_RUNTIME); + } + + if (build.sparsity != SparsityFlag::kDISABLE) + { + config.setFlag(BuilderFlag::kSPARSE_WEIGHTS); + if (build.sparsity == SparsityFlag::kFORCE) + { + sparsify(network, sparseWeights); + } + } + + config.setProfilingVerbosity(build.profilingVerbosity); + config.setAvgTimingIterations(build.avgTiming); + + if (build.fp16) + { + config.setFlag(BuilderFlag::kFP16); + } + if (build.int8) + { + config.setFlag(BuilderFlag::kINT8); + } + if (build.bf16) + { + config.setFlag(BuilderFlag::kBF16); + } + + SMP_RETVAL_IF_FALSE(!(build.int8 && build.fp8), "FP8 and INT8 precisions have been specified", false, err); + + if (build.fp8) + { + config.setFlag(BuilderFlag::kFP8); + } + + if (build.int4) + { + config.setFlag(BuilderFlag::kINT4); + } + + if (build.int8 && !build.fp16) + { + sample::gLogInfo + << "FP32 and INT8 precisions have been specified - more performance might be enabled by additionally " + "specifying --fp16 or --best" + << std::endl; + } + + auto isInt8 = [](const IOFormat& format) { return format.first == DataType::kINT8; }; + auto int8IO = std::count_if(build.inputFormats.begin(), build.inputFormats.end(), isInt8) + + std::count_if(build.outputFormats.begin(), build.outputFormats.end(), isInt8); + + auto hasQDQLayers = [](INetworkDefinition& network) { + // Determine if our network has QDQ layers. + auto const nbLayers = network.getNbLayers(); + for (int32_t i = 0; i < nbLayers; i++) + { + auto const& layer = network.getLayer(i); + if (layer->getType() == LayerType::kQUANTIZE || layer->getType() == LayerType::kDEQUANTIZE) + { + return true; + } + } + return false; + }; + + if (!hasQDQLayers(network) && (build.int8 || int8IO) && build.calibration.empty()) + { + // Explicitly set int8 scales if no calibrator is provided and if I/O tensors use int8, + // because auto calibration does not support this case. 
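+        // setTensorDynamicRange() above fills in placeholder ranges of [-2, 2] for
+        // layer inputs and [-4, 4] for outputs, so the resulting engine is suitable
+        // for performance experiments but not for meaningful accuracy.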
+ SMP_RETVAL_IF_FALSE(setTensorDynamicRange(network), "Error in set tensor dynamic range.", false, err); + } + else if (build.int8) + { + if (!hasQDQLayers(network) && int8IO) + { + try + { + // Set dynamic ranges of int8 inputs / outputs to match scales loaded from calibration cache + // TODO http://nvbugs/3262234 Change the network validation so that this workaround can be removed + setTensorScalesFromCalibration(network, build.inputFormats, build.outputFormats, build.calibration); + } + catch (std::exception&) + { + sample::gLogError + << "Int8IO was specified but impossible to read tensor scales from provided calibration cache file" + << std::endl; + return false; + } + } + IOptimizationProfile* profileCalib{nullptr}; + if (!build.shapesCalib.empty()) + { + profileCalib = builder.createOptimizationProfile(); + for (uint32_t i = 0, n = network.getNbInputs(); i < n; i++) + { + auto* input = network.getInput(i); + Dims profileDims{}; + auto const tensorName = input->getName(); + auto shape = findPlausible(build.shapesCalib, tensorName); + + if (shape == build.shapesCalib.end()) + { + std::ostringstream msg; + msg << "Calibration profile for tensor " << tensorName << " cannot be found!"; + throw std::invalid_argument(msg.str()); + } + + auto shapesCalib = shape->second; + profileDims = toDims(shapesCalib[static_cast(OptProfileSelector::kOPT)]); + // Here we check only kMIN as all profileDims are the same. + SMP_RETVAL_IF_FALSE(profileCalib->setDimensions(tensorName, OptProfileSelector::kMIN, profileDims), + "Error in set dimensions to calibration profile OPT", false, err); + profileCalib->setDimensions(tensorName, OptProfileSelector::kOPT, profileDims); + profileCalib->setDimensions(tensorName, OptProfileSelector::kMAX, profileDims); + sample::gLogInfo << "Set calibration profile for input tensor " << tensorName << " to " << profileDims + << std::endl; + } + SMP_RETVAL_IF_FALSE(profileCalib->isValid(), "Calibration profile is invalid", false, err); + SMP_RETVAL_IF_FALSE( + config.setCalibrationProfile(profileCalib), "Error in set calibration profile", false, err); + } + + std::vector elemCount{}; + for (int i = 0; i < network.getNbInputs(); i++) + { + auto* input = network.getInput(i); + auto const dims = input->getDimensions(); + auto const isDynamicInput + = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }); + + if (profileCalib) + { + elemCount.push_back(volume(profileCalib->getDimensions(input->getName(), OptProfileSelector::kOPT))); + } + else if (!profiles.empty() && isDynamicInput) + { + elemCount.push_back( + volume(profiles[build.calibProfile]->getDimensions(input->getName(), OptProfileSelector::kOPT))); + } + else + { + elemCount.push_back(volume(input->getDimensions())); + } + } + + calibrator.reset(new RndInt8Calibrator(1, elemCount, build.calibration, network, err)); + config.setInt8Calibrator(calibrator.get()); + } + + if (build.directIO) + { + config.setFlag(BuilderFlag::kDIRECT_IO); + } + + switch (build.precisionConstraints) + { + case PrecisionConstraints::kNONE: + // It's the default for TensorRT. 
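+        // (These modes correspond to trtexec's --precisionConstraints=none|obey|prefer:
+        // "obey" fails the build when a requested layer precision cannot be satisfied,
+        // while "prefer" falls back to an available precision with a warning.)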
+ break; + case PrecisionConstraints::kOBEY: config.setFlag(BuilderFlag::kOBEY_PRECISION_CONSTRAINTS); break; + case PrecisionConstraints::kPREFER: config.setFlag(BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); break; + } + + if (!build.layerPrecisions.empty() && build.precisionConstraints != PrecisionConstraints::kNONE) + { + setLayerPrecisions(network, build.layerPrecisions); + } + + if (!build.layerOutputTypes.empty() && build.precisionConstraints != PrecisionConstraints::kNONE) + { + setLayerOutputTypes(network, build.layerOutputTypes); + } + + if (!build.layerDeviceTypes.empty()) + { + setLayerDeviceTypes(network, config, build.layerDeviceTypes); + } + + if (!build.debugTensors.empty()) + { + markDebugTensors(network, build.debugTensors); + } + + if (build.safe && sys.DLACore == -1) + { + config.setEngineCapability(EngineCapability::kSAFETY); + } + + if (build.restricted) + { + config.setFlag(BuilderFlag::kSAFETY_SCOPE); + } + + if (sys.DLACore != -1) + { + if (sys.DLACore < builder.getNbDLACores()) + { + config.setDefaultDeviceType(DeviceType::kDLA); + config.setDLACore(sys.DLACore); + config.setFlag(BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); + if (build.buildDLAStandalone) + { + config.setEngineCapability(EngineCapability::kDLA_STANDALONE); + } + if (build.allowGPUFallback) + { + config.setFlag(BuilderFlag::kGPU_FALLBACK); + } + else + { + // Reformatting runs on GPU, so avoid I/O reformatting. + config.setFlag(BuilderFlag::kDIRECT_IO); + } + if (!build.int8) + { + config.setFlag(BuilderFlag::kFP16); + } + } + else + { + err << "Cannot create DLA engine, " << sys.DLACore << " not available" << std::endl; + return false; + } + } + + if (build.enabledTactics || build.disabledTactics) + { + TacticSources tacticSources = config.getTacticSources(); + tacticSources |= build.enabledTactics; + tacticSources &= ~build.disabledTactics; + config.setTacticSources(tacticSources); + } + + config.setHardwareCompatibilityLevel(build.hardwareCompatibilityLevel); + config.setRuntimePlatform(build.runtimePlatform); + + if (build.maxAuxStreams != defaultMaxAuxStreams) + { + config.setMaxAuxStreams(build.maxAuxStreams); + } + + if (build.allowWeightStreaming) + { + config.setFlag(BuilderFlag::kWEIGHT_STREAMING); + } + + return true; +} + +//! +//! \brief Create a serialized engine for a network defintion +//! +//! \return Whether the engine creation succeeds or fails. +//! +bool networkToSerializedEngine( + BuildOptions const& build, SystemOptions const& sys, IBuilder& builder, BuildEnvironment& env, std::ostream& err) +{ + std::unique_ptr config{builder.createBuilderConfig()}; + std::unique_ptr calibrator; + std::vector> sparseWeights; + SMP_RETVAL_IF_FALSE(config != nullptr, "Config creation failed", false, err); + SMP_RETVAL_IF_FALSE( + setupNetworkAndConfig(build, sys, builder, *env.network, *config, calibrator, err, sparseWeights), + "Network And Config setup failed", false, err); + + std::unique_ptr timingCache{}; + // Try to load cache from file. Create a fresh cache if the file doesn't exist + if (build.timingCacheMode == TimingCacheMode::kGLOBAL) + { + timingCache + = samplesCommon::buildTimingCacheFromFile(gLogger.getTRTLogger(), *config, build.timingCacheFile, err); + } + + // CUDA stream used for profiling by the builder. 
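+    // samplesCommon::makeCudaStream() (from common.h) is assumed to hand back a smart
+    // pointer that owns a heap-allocated cudaStream_t, roughly along these lines:
+    //
+    //   auto deleter = [](cudaStream_t* s) { cudaStreamDestroy(*s); delete s; };
+    //   std::unique_ptr<cudaStream_t, decltype(deleter)> stream(new cudaStream_t, deleter);
+    //   cudaStreamCreateWithFlags(stream.get(), cudaStreamNonBlocking);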
+ auto profileStream = samplesCommon::makeCudaStream(); + SMP_RETVAL_IF_FALSE(profileStream != nullptr, "Cuda stream creation failed", false, err); + config->setProfileStream(*profileStream); + + auto const tBegin = std::chrono::high_resolution_clock::now(); + std::unique_ptr serializedEngine{builder.buildSerializedNetwork(*env.network, *config)}; + SMP_RETVAL_IF_FALSE(serializedEngine != nullptr, "Engine could not be created from network", false, err); + auto const tEnd = std::chrono::high_resolution_clock::now(); + float const buildTime = std::chrono::duration(tEnd - tBegin).count(); + sample::gLogInfo << "Engine built in " << buildTime << " sec." << std::endl; + sample::gLogInfo << "Created engine with size: " << (serializedEngine->size() / 1.0_MiB) << " MiB" << std::endl; + + env.engine.setBlob(serializedEngine); + + if (build.timingCacheMode == TimingCacheMode::kGLOBAL) + { + auto timingCache = config->getTimingCache(); + samplesCommon::updateTimingCacheFile(gLogger.getTRTLogger(), build.timingCacheFile, timingCache, builder); + } + + return true; +} + +//! +//! \brief Parse a given model, create a network and an engine. +//! +bool modelToBuildEnv( + ModelOptions const& model, BuildOptions const& build, SystemOptions& sys, BuildEnvironment& env, std::ostream& err) +{ + env.builder.reset(createBuilder()); + SMP_RETVAL_IF_FALSE(env.builder != nullptr, "Builder creation failed", false, err); + env.builder->setErrorRecorder(&gRecorder); + auto networkFlags = (build.stronglyTyped) + ? 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kSTRONGLY_TYPED) + : 0U; +#if !TRT_WINML + for (auto const& pluginPath : sys.dynamicPlugins) + { + env.builder->getPluginRegistry().loadLibrary(pluginPath.c_str()); + } +#endif + env.network.reset(env.builder->createNetworkV2(networkFlags)); + + std::vector vcPluginLibrariesUsed; + SMP_RETVAL_IF_FALSE(env.network != nullptr, "Network creation failed", false, err); + env.parser + = modelToNetwork(model, build, *env.network, err, build.versionCompatible ? &vcPluginLibrariesUsed : nullptr); + SMP_RETVAL_IF_FALSE(env.parser.operator bool(), "Parsing model failed", false, err); + +#if !TRT_WINML + if (build.versionCompatible && !sys.ignoreParsedPluginLibs && !vcPluginLibrariesUsed.empty()) + { + sample::gLogInfo << "The following plugin libraries were identified by the parser as required for a " + "version-compatible engine:" + << std::endl; + for (auto const& lib : vcPluginLibrariesUsed) + { + sample::gLogInfo << " " << lib << std::endl; + } + if (!build.excludeLeanRuntime) + { + sample::gLogInfo << "These libraries will be added to --setPluginsToSerialize since --excludeLeanRuntime " + "was not specified." + << std::endl; + std::copy(vcPluginLibrariesUsed.begin(), vcPluginLibrariesUsed.end(), + std::back_inserter(sys.setPluginsToSerialize)); + } + sample::gLogInfo << "These libraries will be added to --dynamicPlugins for use at inference time." << std::endl; + std::copy(vcPluginLibrariesUsed.begin(), vcPluginLibrariesUsed.end(), std::back_inserter(sys.dynamicPlugins)); + + // Implicitly-added plugins from ONNX parser should be loaded into plugin registry as well. + for (auto const& pluginPath : vcPluginLibrariesUsed) + { + env.builder->getPluginRegistry().loadLibrary(pluginPath.c_str()); + } + + sample::gLogInfo << "Use --ignoreParsedPluginLibs to disable this behavior." 
<< std::endl; + } +#endif + + SMP_RETVAL_IF_FALSE( + networkToSerializedEngine(build, sys, *env.builder, env, err), "Building engine failed", false, err); + return true; +} + +namespace +{ +std::pair, std::vector> getLayerWeightsRolePair(IRefitter& refitter) +{ + // Get number of refittable items. + auto const nbAll = refitter.getAll(0, nullptr, nullptr); + std::vector layerNames(nbAll); + // Allocate buffers for the items and get them. + std::vector weightsRoles(nbAll); + refitter.getAll(nbAll, layerNames.data(), weightsRoles.data()); + std::vector layerNameStrs(nbAll); + std::transform(layerNames.begin(), layerNames.end(), layerNameStrs.begin(), [](char const* name) { + if (name == nullptr) + { + return std::string{}; + } + return std::string{name}; + }); + return {layerNameStrs, weightsRoles}; +} + +std::pair, std::vector> getMissingLayerWeightsRolePair(IRefitter& refitter) +{ + // Get number of refittable items. + auto const nbMissing = refitter.getMissing(0, nullptr, nullptr); + std::vector layerNames(nbMissing); + // Allocate buffers for the items and get them. + std::vector weightsRoles(nbMissing); + refitter.getMissing(nbMissing, layerNames.data(), weightsRoles.data()); + std::vector layerNameStrs(nbMissing); + std::transform(layerNames.begin(), layerNames.end(), layerNameStrs.begin(), [](char const* name) { + if (name == nullptr) + { + return std::string{}; + } + return std::string{name}; + }); + return {layerNameStrs, weightsRoles}; +} +} // namespace + +bool loadStreamingEngineToBuildEnv(std::string const& filepath, BuildEnvironment& env, std::ostream& err) +{ + auto& reader = env.engine.getFileReader(); + SMP_RETVAL_IF_FALSE(reader.open(filepath), "", false, err << "Error opening engine file: " << filepath); + return true; +} + +bool loadEngineToBuildEnv(std::string const& filepath, BuildEnvironment& env, std::ostream& err) +{ + auto const tBegin = std::chrono::high_resolution_clock::now(); + std::ifstream engineFile(filepath, std::ios::binary); + SMP_RETVAL_IF_FALSE(engineFile.good(), "", false, err << "Error opening engine file: " << filepath); + engineFile.seekg(0, std::ifstream::end); + int64_t fsize = engineFile.tellg(); + engineFile.seekg(0, std::ifstream::beg); + + std::vector engineBlob(fsize); + engineFile.read(reinterpret_cast(engineBlob.data()), fsize); + SMP_RETVAL_IF_FALSE(engineFile.good(), "", false, err << "Error loading engine file: " << filepath); + auto const tEnd = std::chrono::high_resolution_clock::now(); + float const loadTime = std::chrono::duration(tEnd - tBegin).count(); + sample::gLogInfo << "Engine loaded in " << loadTime << " sec." << std::endl; + sample::gLogInfo << "Loaded engine with size: " << (fsize / 1.0_MiB) << " MiB" << std::endl; + + env.engine.setBlob(std::move(engineBlob)); + + return true; +} + +bool printPlanVersion(BuildEnvironment& env, std::ostream& err) +{ + constexpr int64_t kPLAN_SIZE{28}; + std::vector data(kPLAN_SIZE); + auto blob = data.data(); + + auto& reader = env.engine.getFileReader(); + if (reader.isOpen()) + { + SMP_RETVAL_IF_FALSE(reader.read(data.data(), kPLAN_SIZE) == kPLAN_SIZE, "Failed to read plan file", false, err); + } + else + { + SMP_RETVAL_IF_FALSE(env.engine.getBlob().data != nullptr, "Plan file is empty", false, err); + SMP_RETVAL_IF_FALSE(env.engine.getBlob().size >= 28, "Plan file is incorrect", false, err); + blob = static_cast(env.engine.getBlob().data); + } + auto blob32 = reinterpret_cast(blob); + + //! 
Correct TensorRT plan file starts with this tag
+    constexpr uint32_t kPLAN_FILE_TAG{0x74727466U};
+    SMP_RETVAL_IF_FALSE(blob32[0] == kPLAN_FILE_TAG, "Failed to verify a plan tag.", false, err);
+    switch (blob32[1])
+    {
+    case 0U:
+    {
+        // Blob index to store the plan version may depend on the serialization version.
+        sample::gLogInfo << "Plan was created with TensorRT version " << static_cast<int32_t>(blob[24])
+                         << "." << static_cast<int32_t>(blob[25]) << "." << static_cast<int32_t>(blob[26])
+                         << "." << static_cast<int32_t>(blob[27]) << std::endl;
+        return true;
+    }
+    }
+    sample::gLogError << "Serialization version is not supported." << std::endl;
+    return false;
+}
+
+void dumpRefittable(nvinfer1::ICudaEngine& engine)
+{
+    std::unique_ptr<IRefitter> refitter{createRefitter(engine)};
+    if (refitter == nullptr)
+    {
+        sample::gLogError << "Failed to create a refitter." << std::endl;
+        return;
+    }
+
+    auto const& layerWeightsRolePair = getLayerWeightsRolePair(*refitter);
+    auto const& layerNames = layerWeightsRolePair.first;
+    auto const& weightsRoles = layerWeightsRolePair.second;
+    auto const nbAll = layerWeightsRolePair.first.size();
+    for (size_t i = 0; i < nbAll; ++i)
+    {
+        sample::gLogInfo << layerNames[i] << " " << weightsRoles[i] << std::endl;
+    }
+}
+
+ICudaEngine* loadEngine(std::string const& engine, int32_t DLACore, std::ostream& err)
+{
+    BuildEnvironment env(/* isSafe */ false, /* versionCompatible */ false, DLACore, "", getTempfileControlDefaults());
+    return loadEngineToBuildEnv(engine, env, err) ? env.engine.release() : nullptr;
+}
+
+bool saveEngine(const ICudaEngine& engine, std::string const& fileName, std::ostream& err)
+{
+    std::ofstream engineFile(fileName, std::ios::binary);
+    if (!engineFile)
+    {
+        err << "Cannot open engine file: " << fileName << std::endl;
+        return false;
+    }
+
+    std::unique_ptr<IHostMemory> serializedEngine{engine.serialize()};
+    if (serializedEngine == nullptr)
+    {
+        err << "Engine serialization failed" << std::endl;
+        return false;
+    }
+
+    engineFile.write(static_cast<char const*>(serializedEngine->data()), serializedEngine->size());
+    return !engineFile.fail();
+}
+
+bool getEngineBuildEnv(
+    const ModelOptions& model, BuildOptions const& build, SystemOptions& sys, BuildEnvironment& env, std::ostream& err)
+{
+    bool createEngineSuccess{false};
+
+    if (build.load)
+    {
+        if (build.safe)
+        {
+            createEngineSuccess = loadEngineToBuildEnv(build.engine, env, err);
+        }
+        else
+        {
+            createEngineSuccess = loadStreamingEngineToBuildEnv(build.engine, env, err);
+        }
+    }
+    else
+    {
+        createEngineSuccess = modelToBuildEnv(model, build, sys, env, err);
+    }
+
+    SMP_RETVAL_IF_FALSE(createEngineSuccess, "Failed to create engine from model or file.", false, err);
+
+    if (build.getPlanVersionOnly && build.load)
+    {
+        SMP_RETVAL_IF_FALSE(printPlanVersion(env, err), "Failed to get plan file version.", false, err);
+        return true;
+    }
+
+    if (build.save)
+    {
+        std::ofstream engineFile(build.engine, std::ios::binary);
+        auto& engineBlob = env.engine.getBlob();
+        engineFile.write(static_cast<char const*>(engineBlob.data), engineBlob.size);
+        SMP_RETVAL_IF_FALSE(!engineFile.fail(), "Saving engine to file failed.", false, err);
+        engineFile.flush();
+        engineFile.close();
+        if (!build.safe)
+        {
+            env.engine.releaseBlob();
+            SMP_RETVAL_IF_FALSE(loadStreamingEngineToBuildEnv(build.engine, env, err), "Reading engine file failed.", false, err);
+        }
+    }
+
+    return true;
+}
+
+// There is no getWeightsName API, so we identify refittable weights by their WeightsRole.
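+// Illustrative usage (hypothetical caller, not part of the upstream sample): each
+// (role, weights) pair returned below is meant to be forwarded to IRefitter::setWeights,
+// e.g.
+//     for (auto const& rw : getAllRefitWeightsForLayer(*layer))
+//         refitter->setWeights(layer->getName(), rw.first, rw.second);
+// Layer types whose weights cannot be refitted simply yield an empty vector.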
+std::vector> getAllRefitWeightsForLayer(const ILayer& l) +{ + switch (l.getType()) + { + case LayerType::kCONSTANT: + { + auto const& layer = static_cast(l); + auto const weights = layer.getWeights(); + switch (weights.type) + { + case DataType::kFLOAT: + case DataType::kHALF: + case DataType::kBF16: + case DataType::kINT8: + case DataType::kINT32: + case DataType::kINT64: return {std::make_pair(WeightsRole::kCONSTANT, weights)}; + case DataType::kBOOL: + case DataType::kUINT8: + case DataType::kFP8: + case DataType::kINT4: + // Refit not supported for these types. + break; + } + break; + } + case LayerType::kCONVOLUTION: + { + auto const& layer = static_cast(l); + return {std::make_pair(WeightsRole::kKERNEL, layer.getKernelWeights()), + std::make_pair(WeightsRole::kBIAS, layer.getBiasWeights())}; + } + case LayerType::kDECONVOLUTION: + { + auto const& layer = static_cast(l); + return {std::make_pair(WeightsRole::kKERNEL, layer.getKernelWeights()), + std::make_pair(WeightsRole::kBIAS, layer.getBiasWeights())}; + } + case LayerType::kSCALE: + { + auto const& layer = static_cast(l); + return {std::make_pair(WeightsRole::kSCALE, layer.getScale()), + std::make_pair(WeightsRole::kSHIFT, layer.getShift())}; + } + case LayerType::kACTIVATION: + case LayerType::kASSERTION: + case LayerType::kCAST: + case LayerType::kCONCATENATION: + case LayerType::kCONDITION: + case LayerType::kCONDITIONAL_INPUT: + case LayerType::kCONDITIONAL_OUTPUT: + case LayerType::kDEQUANTIZE: + case LayerType::kEINSUM: + case LayerType::kELEMENTWISE: + case LayerType::kFILL: + case LayerType::kGATHER: + case LayerType::kGRID_SAMPLE: + case LayerType::kIDENTITY: + case LayerType::kITERATOR: + case LayerType::kLOOP_OUTPUT: + case LayerType::kLRN: + case LayerType::kMATRIX_MULTIPLY: + case LayerType::kNMS: + case LayerType::kNON_ZERO: + case LayerType::kNORMALIZATION: + case LayerType::kONE_HOT: + case LayerType::kPADDING: + case LayerType::kPARAMETRIC_RELU: + case LayerType::kPLUGIN: + case LayerType::kPLUGIN_V2: + case LayerType::kPLUGIN_V3: + case LayerType::kPOOLING: + case LayerType::kQUANTIZE: + case LayerType::kRAGGED_SOFTMAX: + case LayerType::kRECURRENCE: + case LayerType::kREDUCE: + case LayerType::kRESIZE: + case LayerType::kREVERSE_SEQUENCE: + case LayerType::kSCATTER: + case LayerType::kSELECT: + case LayerType::kSHAPE: + case LayerType::kSHUFFLE: + case LayerType::kSLICE: + case LayerType::kSOFTMAX: + case LayerType::kTOPK: + case LayerType::kTRIP_LIMIT: + case LayerType::kUNARY: return {}; + } + return {}; +} + +bool timeRefit(INetworkDefinition const& network, nvinfer1::ICudaEngine& engine, bool multiThreading) +{ + using time_point = std::chrono::time_point; + using durationMs = std::chrono::duration; + + auto const nbLayers = network.getNbLayers(); + std::unique_ptr refitter{createRefitter(engine)}; + // Set max threads that can be used by refitter. + if (multiThreading && !refitter->setMaxThreads(10)) + { + sample::gLogError << "Failed to set max threads to refitter." << std::endl; + return false; + } + auto const& layerWeightsRolePair = getLayerWeightsRolePair(*refitter); + // We use std::string instead of char const* since we can have copies of layer names. 
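+    // Illustrative lookup enabled by this set (hypothetical layer name): a query such as
+    //     isRefittable("conv1", WeightsRole::kKERNEL)
+    // reduces to a single std::set::find over the (layerName, role) pairs gathered above.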
+ std::set> layerRoleSet; + + auto const& layerNames = layerWeightsRolePair.first; + auto const& weightsRoles = layerWeightsRolePair.second; + + std::transform(layerNames.begin(), layerNames.end(), weightsRoles.begin(), + std::inserter(layerRoleSet, layerRoleSet.begin()), + [](std::string const& layerName, WeightsRole const role) { return std::make_pair(layerName, role); }); + + auto const isRefittable = [&layerRoleSet](char const* layerName, WeightsRole const role) { + return layerRoleSet.find(std::make_pair(layerName, role)) != layerRoleSet.end(); + }; + + auto const setWeights = [&] { + for (int32_t i = 0; i < nbLayers; i++) + { + auto const layer = network.getLayer(i); + auto const roleWeightsVec = getAllRefitWeightsForLayer(*layer); + for (auto const& roleWeights : roleWeightsVec) + { + if (isRefittable(layer->getName(), roleWeights.first)) + { + bool const success = refitter->setWeights(layer->getName(), roleWeights.first, roleWeights.second); + if (!success) + { + return false; + } + } + } + } + return true; + }; + + auto const reportMissingWeights = [&] { + auto const& missingPair = getMissingLayerWeightsRolePair(*refitter); + auto const& layerNames = missingPair.first; + auto const& weightsRoles = missingPair.second; + for (size_t i = 0; i < layerNames.size(); ++i) + { + sample::gLogError << "Missing (" << layerNames[i] << ", " << weightsRoles[i] << ") for refitting." + << std::endl; + } + return layerNames.empty(); + }; + + // Skip weights validation since we are confident that the new weights are similar to the weights used to build + // engine. + refitter->setWeightsValidation(false); + + // Warm up and report missing weights + // We only need to set weights for the first time and that can be reused in later refitting process. + bool const success = setWeights() && reportMissingWeights() && refitter->refitCudaEngine(); + if (!success) + { + return false; + } + + TrtCudaStream stream; + constexpr int32_t kLOOP = 10; + time_point const refitStartTime{std::chrono::steady_clock::now()}; + { + for (int32_t l = 0; l < kLOOP; l++) + { + if (!refitter->refitCudaEngineAsync(stream.get())) + { + return false; + } + } + } + stream.synchronize(); + time_point const refitEndTime{std::chrono::steady_clock::now()}; + + sample::gLogInfo << "Engine refitted" + << " in " << durationMs(refitEndTime - refitStartTime).count() / kLOOP << " ms." << std::endl; + return true; +} + +namespace +{ +void* initSafeRuntime() +{ + void* handle{nullptr}; + // libsafe_executor.so will be renamed to libnvinfer_safe.so when TRTS-9421 completes. + // Currently libsafe_executor_debug.so for samplesCommon::isDebug() is not ready. 
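+// Sketch of the guarded load below once TRTS-9421 lands (non-Windows only):
+//     handle = dlopen("libsafe_executor.so", RTLD_LAZY | RTLD_GLOBAL);
+// Until then the block is compiled out, initSafeRuntime() returns nullptr, and
+// hasSafeRuntime() reports false.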
+#define TRTS_9421_COMPLETED 0 +#if TRTS_9421_COMPLETED +#if !defined(_WIN32) + std::string const dllName{"libsafe_executor.so"}; +#if SANITIZER_BUILD + handle = dlopen(dllName.c_str(), RTLD_LAZY | RTLD_NODELETE); +#else + // RTLD_GLOBAL is used for symbol resolution of subsequently loaded plugin libraries + handle = dlopen(dllName.c_str(), RTLD_LAZY | RTLD_GLOBAL); +#endif +#endif +#endif // TRTS_9421_COMPLETED + return handle; +} + +#if !defined(_WIN32) +struct DllDeleter +{ + void operator()(void* handle) + { + if (handle != nullptr) + { + dlclose(handle); + } + } +}; +const std::unique_ptr safeRuntimeLibrary{initSafeRuntime()}; +#endif +} // namespace + +bool hasSafeRuntime() +{ + bool ret{false}; +#if !defined(_WIN32) + ret = (safeRuntimeLibrary != nullptr); +#endif + return ret; +} + +} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleEngines.h b/src/Detector/tensorrt_yolo/common/sampleEngines.h index 620b51a1..ec02e909 100644 --- a/src/Detector/tensorrt_yolo/common/sampleEngines.h +++ b/src/Detector/tensorrt_yolo/common/sampleEngines.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -17,58 +18,227 @@ #ifndef TRT_SAMPLE_ENGINES_H #define TRT_SAMPLE_ENGINES_H -#include -#include - #include "NvInfer.h" - -#if (NV_TENSORRT_MAJOR > 7) - -#include "NvInferConsistency.h" -#include "NvInferSafeRuntime.h" - -#endif - #include "NvOnnxParser.h" #include "sampleOptions.h" #include "sampleUtils.h" +#include "streamReader.h" +#include +#include namespace sample { struct Parser { - TrtUniquePtr onnxParser; + std::unique_ptr onnxParser; operator bool() const { - return onnxParser.operator bool(); + return onnxParser != nullptr; } }; -struct BuildEnvironment +//! +//! \brief Helper struct to faciliate engine serialization and deserialization. It does not own the underlying memory. +//! +struct EngineBlob { - TrtUniquePtr network; - //! Parser that creates the network. Must be declared *after* network, so that when - //! ~BuildEnvironment() executes, the parser is destroyed before the network is destroyed. - Parser parser; - TrtUniquePtr engine; - std::unique_ptr safeEngine; - std::vector engineBlob; + EngineBlob(void* engineData, size_t engineSize) + : data(engineData) + , size(engineSize) + { + } + void* data{}; + size_t size{}; + bool empty() const + { + return size == 0; + } }; //! -//! \brief Generate a network definition for a given model -//! -//! \return Parser The parser used to initialize the network and that holds the weights for the network, or an invalid -//! parser (the returned parser converts to false if tested) +//! \brief A helper class to hold a serialized engine (std or safe) and only deserialize it when being accessed. //! -//! Constant input dimensions in the model must not be changed in the corresponding -//! network definition, because its correctness may rely on the constants. -//! -//! \see Parser::operator bool() -//! 
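+//! Typical flow (illustrative, based on the members declared below; "serializedBytes"
+//! is a hypothetical std::vector<uint8_t>):
+//!     engine.setBlob(std::move(serializedBytes)); // store bytes, nothing deserialized yet
+//!     nvinfer1::ICudaEngine* e = engine.get();    // first access triggers deserialization
+//!     engine.releaseBlob();                       // optionally free the serialized copy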
-Parser modelToNetwork(const ModelOptions& model, nvinfer1::INetworkDefinition& network, std::ostream& err); +class LazilyDeserializedEngine +{ +public: + //! + //! \brief Delete default constructor to make sure isSafe and DLACore are always set. + //! + LazilyDeserializedEngine() = delete; + + //! + //! \brief Constructor of LazilyDeserializedEngine. + //! + LazilyDeserializedEngine(bool isSafe, bool versionCompatible, int32_t DLACore, std::string const& tempdir, + nvinfer1::TempfileControlFlags tempfileControls, std::string const& leanDLLPath) + : mIsSafe(isSafe) + , mVersionCompatible(versionCompatible) + , mDLACore(DLACore) + , mTempdir(tempdir) + , mTempfileControls(tempfileControls) + , mLeanDLLPath(leanDLLPath) + { + mFileReader = std::make_unique(); + } + + //! + //! \brief Move from another LazilyDeserializedEngine. + //! + LazilyDeserializedEngine(LazilyDeserializedEngine&& other) = default; + + //! + //! \brief Delete copy constructor. + //! + LazilyDeserializedEngine(LazilyDeserializedEngine const& other) = delete; + + //! + //! \brief Get the pointer to the ICudaEngine. Triggers deserialization if not already done so. + //! + nvinfer1::ICudaEngine* get(); + + //! + //! \brief Get the pointer to the ICudaEngine and release the ownership. + //! + nvinfer1::ICudaEngine* release(); + + //! + //! \brief Get the underlying blob storing serialized engine. + //! + EngineBlob const getBlob() const + { + ASSERT((!mFileReader || !mFileReader->isOpen()) + && "Attempting to access the glob when there is an open file reader!"); + if (!mEngineBlob.empty()) + { + return EngineBlob{const_cast(static_cast(mEngineBlob.data())), mEngineBlob.size()}; + } + if (mEngineBlobHostMemory.get() != nullptr && mEngineBlobHostMemory->size() > 0) + { + return EngineBlob{mEngineBlobHostMemory->data(), mEngineBlobHostMemory->size()}; + } + ASSERT(false && "Attempting to access an empty engine!"); + return EngineBlob{nullptr, 0}; + } + + //! + //! \brief Set the underlying blob storing the serialized engine without duplicating IHostMemory. + //! + void setBlob(std::unique_ptr& data) + { + ASSERT(data.get() && data->size() > 0); + mEngineBlobHostMemory = std::move(data); + mEngine.reset(); + } + + //! + //! \brief Set the underlying blob storing the serialized engine without duplicating vector memory. + //! + void setBlob(std::vector&& engineBlob) + { + mEngineBlob = std::move(engineBlob); + mEngine.reset(); + } + + //! + //! \brief Release the underlying blob without deleting the deserialized engine. + //! + void releaseBlob() + { + mEngineBlob.clear(); + mEngineBlobHostMemory.reset(); + } + + //! + //! \brief Get the file stream reader used for deserialization + //! + samplesCommon::FileStreamReader& getFileReader() + { + ASSERT(mFileReader); + return *mFileReader; + } + + //! + //! \brief Get if safe mode is enabled. + //! + bool isSafe() + { + return mIsSafe; + } + + void setDynamicPlugins(std::vector const& dynamicPlugins) + { + mDynamicPlugins = dynamicPlugins; + } + +private: + bool mIsSafe{false}; + bool mVersionCompatible{false}; + int32_t mDLACore{-1}; + std::vector mEngineBlob; + std::unique_ptr mFileReader; + + // Directly use the host memory of a serialized engine instead of duplicating the engine in CPU memory. + std::unique_ptr mEngineBlobHostMemory; + + std::string mTempdir{}; + nvinfer1::TempfileControlFlags mTempfileControls{getTempfileControlDefaults()}; + std::string mLeanDLLPath{}; + std::vector mDynamicPlugins; + + //! \name Owned TensorRT objects + //! 
Per TensorRT object lifetime requirements as outlined in the developer guide, + //! the runtime must remain live while any engines created by the runtime are live. + //! DO NOT ADJUST the declaration order here: runtime -> (engine). + //! Destruction occurs in reverse declaration order: (engine) -> runtime. + //!@{ + + //! The runtime used to track parent of mRuntime if one exists. + //! Needed to load mRuntime if lean.so is supplied through file system path. + std::unique_ptr mParentRuntime{}; + + //! The runtime that is used to deserialize the engine. + std::unique_ptr mRuntime{}; + + //! If mIsSafe is false, this points to the deserialized std engine + std::unique_ptr mEngine{}; + + //!@} +}; + +struct BuildEnvironment +{ + BuildEnvironment() = delete; + BuildEnvironment(BuildEnvironment const& other) = delete; + BuildEnvironment(BuildEnvironment&& other) = delete; + BuildEnvironment(bool isSafe, bool versionCompatible, int32_t DLACore, std::string const& tempdir, + nvinfer1::TempfileControlFlags tempfileControls, std::string const& leanDLLPath = "") + : engine(isSafe, versionCompatible, DLACore, tempdir, tempfileControls, leanDLLPath) + { + } + + //! \name Owned TensorRT objects + //! Per TensorRT object lifetime requirements as outlined in the developer guide, + //! factory objects must remain live while the objects created by those factories + //! are live (with the exception of builder -> engine). + //! DO NOT ADJUST the declaration order here: builder -> network -> parser. + //! Destruction occurs in reverse declaration order: parser -> network -> builder. + //!@{ + + //! The builder used to build the engine. + std::unique_ptr builder; + + //! The network used by the builder. + std::unique_ptr network; + + //! The parser used to specify the network. + Parser parser; + + //! The engine. + LazilyDeserializedEngine engine; + //!@} +}; //! //! \brief Set up network and config @@ -89,95 +259,63 @@ void dumpRefittable(nvinfer1::ICudaEngine& engine); //! //! \return Pointer to the engine loaded or nullptr if the operation failed //! -nvinfer1::ICudaEngine* loadEngine(const std::string& engine, int DLACore, std::ostream& err); +nvinfer1::ICudaEngine* loadEngine(std::string const& engine, int32_t DLACore, std::ostream& err); //! //! \brief Save an engine into a file //! //! \return boolean Return true if the engine was successfully saved //! -bool saveEngine(const nvinfer1::ICudaEngine& engine, const std::string& fileName, std::ostream& err); +bool saveEngine(nvinfer1::ICudaEngine const& engine, std::string const& fileName, std::ostream& err); //! //! \brief Create an engine from model or serialized file, and optionally save engine //! //! \return Pointer to the engine created or nullptr if the creation failed //! -bool getEngineBuildEnv(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, - BuildEnvironment& env, std::ostream& err); - -//! -//! \brief Create an engine from model or serialized file, and optionally save engine -//! -//! \return Pointer to the engine created or nullptr if the creation failed -//! -inline TrtUniquePtr getEngine( - const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err) -{ - BuildEnvironment env; - TrtUniquePtr engine; - if (getEngineBuildEnv(model, build, sys, env, err)) - { - engine.swap(env.engine); - } - return engine; -} +bool getEngineBuildEnv( + ModelOptions const& model, BuildOptions const& build, SystemOptions& sys, BuildEnvironment& env, std::ostream& err); //! //! 
\brief Create a serialized network
 //!
 //! \return Pointer to a host memory for a serialized network
 //!
-nvinfer1::IHostMemory* networkToSerialized(const BuildOptions& build, const SystemOptions& sys, nvinfer1::IBuilder& builder,
-    nvinfer1::INetworkDefinition& network, std::ostream& err);
+nvinfer1::IHostMemory* networkToSerialized(const BuildOptions& build, const SystemOptions& sys,
+    nvinfer1::IBuilder& builder, nvinfer1::INetworkDefinition& network, std::ostream& err);
 
 //!
 //! \brief Transfer a model to a serialized network
 //!
 //! \return Pointer to a host memory for a serialized network
 //!
-nvinfer1::IHostMemory* modelToSerialized(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err);
+nvinfer1::IHostMemory* modelToSerialized(
+    const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err);
 
 //!
 //! \brief Serialize network and save it into a file
 //!
 //! \return boolean Return true if the network was successfully serialized and saved
 //!
-bool serializeAndSave(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err);
+bool serializeAndSave(
+    const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err);
 
 bool timeRefit(const nvinfer1::INetworkDefinition& network, nvinfer1::ICudaEngine& engine, bool multiThreading);
 
 //!
 //! \brief Set tensor scales from a calibration table
 //!
-void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, const std::vector<IOFormat>& inputFormats,
-    const std::vector<IOFormat>& outputFormats, const std::string& calibrationFile);
+void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, std::vector<IOFormat> const& inputFormats,
+    std::vector<IOFormat> const& outputFormats, std::string const& calibrationFile);
 
 //!
 //! \brief Check if safe runtime is loaded.
 //!
 bool hasSafeRuntime();
 
-//!
-//! \brief Create a safe runtime object if the dynamic library is loaded.
-//!
-nvinfer1::safe::IRuntime* createSafeInferRuntime(nvinfer1::ILogger& logger) noexcept;
-
-//!
-//! \brief Check if consistency checker is loaded.
-//!
-bool hasConsistencyChecker();
+bool loadStreamingEngineToBuildEnv(std::string const& engine, BuildEnvironment& env, std::ostream& err);
 
-//!
-//! \brief Create a consistency checker object if the dynamic library is loaded.
-//!
-nvinfer1::consistency::IConsistencyChecker* createConsistencyChecker(
-    nvinfer1::ILogger& logger, nvinfer1::IHostMemory const* engine) noexcept;
-
-//!
-//! \brief Run consistency check on serialized engine.
-//!
-bool checkSafeEngine(void const* serializedEngine, int32_t const engineSize);
+bool loadEngineToBuildEnv(std::string const& engine, BuildEnvironment& env, std::ostream& err);
 
 } // namespace sample
 
 #endif // TRT_SAMPLE_ENGINES_H
diff --git a/src/Detector/tensorrt_yolo/common/sampleEntrypoints.h b/src/Detector/tensorrt_yolo/common/sampleEntrypoints.h
new file mode 100644
index 00000000..cc8bf1b9
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common/sampleEntrypoints.h
@@ -0,0 +1,101 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_SAMPLE_ENTRYPOINTS_H +#define TRT_SAMPLE_ENTRYPOINTS_H + +//! \file sampleEntrypoints.h +//! +//! Declares and conditionally defines entrypoints needed to create base TensorRT objects, depending +//! on whether the given sample uses TRT at link time or dynamically. Since common code is built once +//! and shared across all samples (both link-time and dynamic TRT), it does not define these entrypoints, +//! so each sample must define them individually. +//! +//! Samples that use TRT at link time can define DEFINE_TRT_ENTRYPOINTS before including this header to +//! pick up the definitions here. + +#include "NvInfer.h" +#include "NvOnnxParser.h" +#include "logger.h" + +extern nvinfer1::IBuilder* createBuilder(); +extern nvinfer1::IRuntime* createRuntime(); +extern nvinfer1::IRefitter* createRefitter(nvinfer1::ICudaEngine& engine); + +extern nvonnxparser::IParser* createONNXParser(nvinfer1::INetworkDefinition& network); + +#if !defined(DEFINE_TRT_ENTRYPOINTS) +#define DEFINE_TRT_ENTRYPOINTS 0 +#endif + +// Allow opting out of individual entrypoints that are unused by the sample +#if !defined(DEFINE_TRT_BUILDER_ENTRYPOINT) +#define DEFINE_TRT_BUILDER_ENTRYPOINT 1 +#endif +#if !defined(DEFINE_TRT_RUNTIME_ENTRYPOINT) +#define DEFINE_TRT_RUNTIME_ENTRYPOINT 1 +#endif +#if !defined(DEFINE_TRT_REFITTER_ENTRYPOINT) +#define DEFINE_TRT_REFITTER_ENTRYPOINT 1 +#endif +#if !defined(DEFINE_TRT_ONNX_PARSER_ENTRYPOINT) +#define DEFINE_TRT_ONNX_PARSER_ENTRYPOINT 1 +#endif +#if !defined(DEFINE_TRT_LEGACY_PARSER_ENTRYPOINT) +#define DEFINE_TRT_LEGACY_PARSER_ENTRYPOINT 1 +#endif + +#if DEFINE_TRT_ENTRYPOINTS +nvinfer1::IBuilder* createBuilder() +{ +#if DEFINE_TRT_BUILDER_ENTRYPOINT + return nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger()); +#else + return {}; +#endif +} + +nvinfer1::IRuntime* createRuntime() +{ +#if DEFINE_TRT_RUNTIME_ENTRYPOINT + return nvinfer1::createInferRuntime(sample::gLogger.getTRTLogger()); +#else + return {}; +#endif +} + +nvinfer1::IRefitter* createRefitter(nvinfer1::ICudaEngine& engine) +{ +#if DEFINE_TRT_REFITTER_ENTRYPOINT + return nvinfer1::createInferRefitter(engine, sample::gLogger.getTRTLogger()); +#else + return {}; +#endif +} + +nvonnxparser::IParser* createONNXParser(nvinfer1::INetworkDefinition& network) +{ +#if DEFINE_TRT_ONNX_PARSER_ENTRYPOINT + return nvonnxparser::createParser(network, sample::gLogger.getTRTLogger()); +#else + return {}; +#endif +} + +#endif // DEFINE_TRT_ENTRYPOINTS + +#endif // TRT_SAMPLE_ENTRYPOINTS_H diff --git a/src/Detector/tensorrt_yolo/common/sampleInference.cpp_ b/src/Detector/tensorrt_yolo/common/sampleInference.cpp_ new file mode 100644 index 00000000..ca0098d4 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/sampleInference.cpp_ @@ -0,0 +1,1622 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__QNX__) +#include +#include +#endif + +#include "NvInfer.h" + +#include "ErrorRecorder.h" +#include "bfloat16.h" +#include "logger.h" +#include "sampleDevice.h" +#include "sampleEngines.h" +#include "sampleInference.h" +#include "sampleOptions.h" +#include "sampleReporting.h" +#include "sampleUtils.h" +using namespace nvinfer1; +namespace sample +{ + +template +bool validateTensorNames(TMapType const& map, TEngineType const* engine, int32_t const endBindingIndex) +{ + // Check if the provided input tensor names match the input tensors of the engine. + // Throw an error if the provided input tensor names cannot be found because it implies a potential typo. + for (auto const& item : map) + { + bool tensorNameFound{false}; + for (int32_t b = 0; b < endBindingIndex; ++b) + { + auto const tensorName = engine->getIOTensorName(b); + auto const tensorIOMode = engine->getTensorIOMode(tensorName); + if (tensorIOMode == nvinfer1::TensorIOMode::kINPUT && matchStringWithOneWildcard(item.first, tensorName)) + { + tensorNameFound = true; + break; + } + } + if (!tensorNameFound) + { + sample::gLogError << "Cannot find input tensor with name \"" << item.first << "\" in the engine bindings! " + << "Please make sure the input tensor names are correct." << std::endl; + return false; + } + } + return true; +} + +template +class FillBindingClosure +{ +private: + using InputsMap = std::unordered_map; + using BindingsVector = std::vector>; + + TEngineType const* mEngine; + nvinfer1::IExecutionContext const* mContext; + InputsMap const& inputs; + BindingsVector& bindings; + int32_t batch; + int32_t endBindingIndex; + int32_t profileIndex; + + void fillOneBinding(TensorInfo const& tensorInfo) + { + auto const name = tensorInfo.name; + auto const* bindingInOutStr = tensorInfo.isInput ? "Input" : "Output"; + for (auto& binding : bindings) + { + auto const input = findPlausible(inputs, name); + if (tensorInfo.isInput && input != inputs.end()) + { + sample::gLogInfo << "Using values loaded from " << input->second << " for input " << name << std::endl; + binding->addBinding(tensorInfo, input->second); + } + else + { + if (tensorInfo.isInput) + { + sample::gLogInfo << "Using random values for input " << name << std::endl; + } + binding->addBinding(tensorInfo); + } + if (tensorInfo.isDynamic) + { + sample::gLogInfo << bindingInOutStr << " binding for " << name + << " is dynamic and will be created during execution using OutputAllocator." + << std::endl; + } + else + { + sample::gLogInfo << bindingInOutStr << " binding for " << name << " with dimensions " << tensorInfo.dims + << " is created." << std::endl; + } + } + } + + bool fillAllBindings(int32_t batch, int32_t endBindingIndex) + { + if (!validateTensorNames(inputs, mEngine, endBindingIndex)) + { + sample::gLogError << "Invalid tensor names found in --loadInputs flag." 
<< std::endl; + return false; + } + for (int32_t b = 0; b < endBindingIndex; b++) + { + TensorInfo tensorInfo; + tensorInfo.bindingIndex = b; + getTensorInfo(tensorInfo); + tensorInfo.updateVolume(batch); + fillOneBinding(tensorInfo); + } + return true; + } + + void getTensorInfo(TensorInfo& tensorInfo); + +public: + FillBindingClosure(TEngineType const* _engine, nvinfer1::IExecutionContext const* _context, + InputsMap const& _inputs, BindingsVector& _bindings, int32_t _batch, int32_t _endBindingIndex, + int32_t _profileIndex) + : mEngine(_engine) + , mContext(_context) + , inputs(_inputs) + , bindings(_bindings) + , batch(_batch) + , endBindingIndex(_endBindingIndex) + , profileIndex(_profileIndex) + { + } + + bool operator()() + { + return fillAllBindings(batch, endBindingIndex); + } +}; + +template <> +void FillBindingClosure::getTensorInfo(TensorInfo& tensorInfo) +{ + auto const b = tensorInfo.bindingIndex; + auto const name = mEngine->getIOTensorName(b); + tensorInfo.name = name; + tensorInfo.dims = mContext->getTensorShape(name); + tensorInfo.isDynamic = std::any_of( + tensorInfo.dims.d, tensorInfo.dims.d + tensorInfo.dims.nbDims, [](int32_t dim) { return dim == -1; }); + tensorInfo.comps = mEngine->getTensorComponentsPerElement(name, profileIndex); + tensorInfo.strides = mContext->getTensorStrides(name); + tensorInfo.vectorDimIndex = mEngine->getTensorVectorizedDim(name, profileIndex); + tensorInfo.isInput = mEngine->getTensorIOMode(name) == TensorIOMode::kINPUT; + tensorInfo.dataType = mEngine->getTensorDataType(name); +} + +namespace +{ +bool allocateContextMemory(InferenceEnvironment& iEnv, InferenceOptions const& inference) +{ + auto* engine = iEnv.engine.get(); + iEnv.deviceMemory.resize(inference.infStreams); + // Delay context memory allocation until input shapes are specified because runtime allocation would require actual + // input shapes. + for (int32_t i = 0; i < inference.infStreams; ++i) + { + auto const& ec = iEnv.contexts.at(i); + if (inference.memoryAllocationStrategy == MemoryAllocationStrategy::kSTATIC) + { + sample::gLogInfo << "Created execution context with device memory size: " + << (engine->getDeviceMemorySize() / 1.0_MiB) << " MiB" << std::endl; + } + else + { + size_t sizeToAlloc{0}; + const char* allocReason{nullptr}; + if (inference.memoryAllocationStrategy == MemoryAllocationStrategy::kPROFILE) + { + auto const p = inference.optProfileIndex; + sizeToAlloc = engine->getDeviceMemorySizeForProfile(p); + allocReason = "current profile"; + } + else if (inference.memoryAllocationStrategy == MemoryAllocationStrategy::kRUNTIME) + { + sizeToAlloc = ec->updateDeviceMemorySizeForShapes(); + allocReason = "current input shapes"; + } + else + { + sample::gLogError << "Unrecognizable memory allocation strategy." 
<< std::endl; + return false; + } + iEnv.deviceMemory.at(i) = TrtDeviceBuffer(sizeToAlloc); + ec->setDeviceMemoryV2(iEnv.deviceMemory.at(i).get(), iEnv.deviceMemory.at(i).getSize()); + sample::gLogInfo << "Maximum device memory size across all profiles: " + << (engine->getDeviceMemorySizeV2() / 1.0_MiB) << " MiB" << std::endl; + sample::gLogInfo << "Only allocated device memory enough for " << allocReason << ": " + << (sizeToAlloc / 1.0_MiB) << " MiB" << std::endl; + } + } + return true; +} +} // namespace + +bool setUpInference(InferenceEnvironment& iEnv, InferenceOptions const& inference, SystemOptions const& system) +{ +#if TRT_WINML + int32_t const isIntegrated{}; +#else + int32_t device{}; + cudaCheck(cudaGetDevice(&device)); + + cudaDeviceProp properties; + cudaCheck(cudaGetDeviceProperties(&properties, device)); + int32_t const isIntegrated{properties.integrated}; +#endif + // Use managed memory on integrated devices when transfers are skipped + // and when it is explicitly requested on the commandline. + bool useManagedMemory{(inference.skipTransfers && isIntegrated) || inference.useManaged}; + SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError); + + using FillStdBindings = FillBindingClosure; + + auto* engine = iEnv.engine.get(); + SMP_RETVAL_IF_FALSE(engine != nullptr, "Got invalid engine!", false, sample::gLogError); + + // Release serialized blob to save memory space. + iEnv.engine.releaseBlob(); + + // Setup weight streaming if enabled + if (engine->getStreamableWeightsSize() > 0) + { + auto const& budget = inference.weightStreamingBudget; + int64_t wsBudget = budget.bytes; + if (budget.percent != 100.0) + { + double const percent = budget.percent; + ASSERT(percent < 100.0); + auto const max = engine->getStreamableWeightsSize(); + wsBudget = (max >= 0) ? (percent / 100) * (max) : WeightStreamingBudget::kDISABLE; + } + + if (wsBudget == WeightStreamingBudget::kDISABLE) + { + wsBudget = engine->getStreamableWeightsSize(); + } + else if (wsBudget == WeightStreamingBudget::kAUTOMATIC) + { + wsBudget = engine->getWeightStreamingAutomaticBudget(); + } + ASSERT(wsBudget >= 0); + bool success = engine->setWeightStreamingBudgetV2(wsBudget); + SMP_RETVAL_IF_FALSE(success, "Failed to set weight streaming limit!", false, sample::gLogError); + switch (wsBudget) + { + case WeightStreamingBudget::kDISABLE: + { + sample::gLogInfo << "Weight streaming has been disabled at runtime." << std::endl; + break; + } + + case WeightStreamingBudget::kAUTOMATIC: + { + sample::gLogInfo << "The weight streaming budget will automatically be chosen by TensorRT." << std::endl; + break; + } + default: + { + sample::gLogInfo << "Weight streaming is enabled with a device memory limit of " << wsBudget << " bytes." + << std::endl; + break; + } + } + } + + int32_t const nbOptProfiles = engine->getNbOptimizationProfiles(); + + if (inference.optProfileIndex >= nbOptProfiles) + { + sample::gLogError << "Selected profile index " << inference.optProfileIndex + << " exceeds the number of profiles that the engine holds. " << std::endl; + return false; + } + + if (nbOptProfiles > 1 && !inference.setOptProfile) + { + sample::gLogWarning << nbOptProfiles + << " profiles detected but not set. Running with profile 0. Please use " + "--dumpOptimizationProfile to see all available profiles." 
+ << std::endl; + } + + cudaStream_t setOptProfileStream; + CHECK(cudaStreamCreate(&setOptProfileStream)); + + for (int32_t s = 0; s < inference.infStreams; ++s) + { + IExecutionContext* ec{nullptr}; + if (inference.memoryAllocationStrategy == MemoryAllocationStrategy::kSTATIC) + { + // Let TRT pre-allocate and manage the memory. + ec = engine->createExecutionContext(); + } + else + { + // Allocate based on the current profile or runtime shapes. + ec = engine->createExecutionContext(ExecutionContextAllocationStrategy::kUSER_MANAGED); + } + if (ec == nullptr) + { + sample::gLogError << "Unable to create execution context for stream " << s << "." << std::endl; + return false; + } + ec->setNvtxVerbosity(inference.nvtxVerbosity); + +#if !TRT_WINML + int32_t const persistentCacheLimit + = samplesCommon::getMaxPersistentCacheSize() * inference.persistentCacheRatio; + sample::gLogInfo << "Setting persistentCacheLimit to " << persistentCacheLimit << " bytes." << std::endl; + ec->setPersistentCacheLimit(persistentCacheLimit); +#endif + + auto setProfile = ec->setOptimizationProfileAsync(inference.optProfileIndex, setOptProfileStream); + CHECK(cudaStreamSynchronize(setOptProfileStream)); + + if (!setProfile) + { + sample::gLogError << "Set optimization profile failed. " << std::endl; + if (inference.infStreams > 1) + { + sample::gLogError + << "Please ensure that the engine is built with preview feature profileSharing0806 enabled. " + << std::endl; + } + return false; + } + + iEnv.contexts.emplace_back(ec); + iEnv.bindings.emplace_back(new Bindings(useManagedMemory)); + } + + CHECK(cudaStreamDestroy(setOptProfileStream)); + + if (iEnv.profiler) + { + iEnv.contexts.front()->setProfiler(iEnv.profiler.get()); + // Always run reportToProfiler() after enqueue launch + iEnv.contexts.front()->setEnqueueEmitsProfile(false); + } + + int32_t const endBindingIndex = engine->getNbIOTensors(); + + // Make sure that the tensor names provided in command-line args actually exist in any of the engine bindings + // to avoid silent typos. + if (!validateTensorNames(inference.shapes, engine, endBindingIndex)) + { + sample::gLogError << "Invalid tensor names found in --shapes flag." << std::endl; + return false; + } + + for (int32_t b = 0; b < endBindingIndex; ++b) + { + auto const& name = engine->getIOTensorName(b); + auto const& mode = engine->getTensorIOMode(name); + if (mode == TensorIOMode::kINPUT) + { + Dims const dims = iEnv.contexts.front()->getTensorShape(name); + bool isShapeInferenceIO{false}; + isShapeInferenceIO = engine->isShapeInferenceIO(name); + bool const hasRuntimeDim = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }); + auto const shape = findPlausible(inference.shapes, name); + if (hasRuntimeDim || isShapeInferenceIO) + { + // Set shapeData to either dimensions of the input (if it has a dynamic shape) + // or set to values of the input (if it is an input shape tensor). + std::vector shapeData; + + if (shape == inference.shapes.end()) + { + // No information provided. Use default value for missing data. + constexpr int32_t kDEFAULT_VALUE = 1; + if (isShapeInferenceIO) + { + // Set shape tensor to all ones. + shapeData.assign(volume(dims, 0, dims.nbDims), kDEFAULT_VALUE); + sample::gLogWarning << "Values missing for input shape tensor: " << name + << "Automatically setting values to: " << shapeData << std::endl; + } + else + { + // Use default value for unspecified runtime dimensions. 
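+                        // Example (hypothetical network input): profile dims {-1, 3, 224, 224}
+                        // become shapeData {1, 3, 224, 224}, i.e. every -1 is replaced by
+                        // kDEFAULT_VALUE before the warning below is printed.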
+ shapeData.resize(dims.nbDims); + std::transform(dims.d, dims.d + dims.nbDims, shapeData.begin(), + [&](int32_t dimension) { return dimension >= 0 ? dimension : kDEFAULT_VALUE; }); + sample::gLogWarning << "Shape missing for input with dynamic shape: " << name + << "Automatically setting shape to: " << shapeData << std::endl; + } + } + else if (inference.inputs.count(shape->first) && isShapeInferenceIO) + { + // Load shape tensor from file. + int64_t const size = volume(dims, 0, dims.nbDims); + shapeData.resize(size); + auto const& filename = inference.inputs.at(shape->first); + auto dst = reinterpret_cast(shapeData.data()); + loadFromFile(filename, dst, size * sizeof(decltype(shapeData)::value_type)); + } + else + { + shapeData = shape->second; + } + + int32_t* shapeTensorData{nullptr}; + if (isShapeInferenceIO) + { + // Save the data in iEnv, in a way that it's address does not change + // before enqueueV3 is called. + iEnv.inputShapeTensorValues.emplace_back(shapeData); + shapeTensorData = iEnv.inputShapeTensorValues.back().data(); + } + + for (auto& c : iEnv.contexts) + { + if (isShapeInferenceIO) + { + sample::gLogInfo << "Set input shape tensor " << name << " to: " << shapeData << std::endl; + if (!c->setTensorAddress(name, shapeTensorData)) + { + return false; + } + } + else + { + sample::gLogInfo << "Set shape of input tensor " << name << " to: " << shapeData + << std::endl; + if (!c->setInputShape(name, toDims(shapeData))) + { + return false; + } + } + } + } + else if (nbOptProfiles && shape != inference.shapes.end()) + { + // Check if the provided shape matches the static dimensions in the engine. + for (auto& c : iEnv.contexts) + { + if (!c->setInputShape(name, toDims(shape->second))) + { + sample::gLogError << "The engine was built with static shapes for input tensor " << name + << " but the provided shapes do not match the static shapes!" << std::endl; + return false; + } + } + } + } + } + + // Create Debug Listener and turn on debug states if client requested dumping debug tensors. 
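+    // Illustrative mapping (hypothetical tensor name): a debug-tensor spec of the form
+    // "conv1:conv1.bin" would yield debugTensorFileNames = {{"conv1", "conv1.bin"}}; each
+    // listed tensor has its debug state enabled so the writer's callback receives its values.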
+    if (!inference.debugTensorFileNames.empty())
+    {
+        iEnv.listener.reset(new DebugTensorWriter(inference.debugTensorFileNames));
+        iEnv.contexts.front()->setDebugListener(iEnv.listener.get());
+        for (auto const& s : inference.debugTensorFileNames)
+        {
+            iEnv.contexts.front()->setTensorDebugState(s.first.c_str(), true);
+        }
+    }
+
+    if (!allocateContextMemory(iEnv, inference))
+    {
+        return false;
+    }
+
+    auto const* context = iEnv.contexts.front().get();
+    return FillStdBindings(
+        engine, context, inference.inputs, iEnv.bindings, 1, endBindingIndex, inference.optProfileIndex)();
+}
+
+TaskInferenceEnvironment::TaskInferenceEnvironment(
+    std::string engineFile, InferenceOptions inference, int32_t deviceId, int32_t DLACore, int32_t bs)
+    : iOptions(inference)
+    , device(deviceId)
+    , batch(bs)
+{
+    BuildEnvironment bEnv(/* isSafe */ false, /* versionCompatible */ false, DLACore, "", getTempfileControlDefaults());
+    loadEngineToBuildEnv(engineFile, bEnv, sample::gLogError);
+    std::unique_ptr<InferenceEnvironment> tmp(new InferenceEnvironment(bEnv));
+    iEnv = std::move(tmp);
+
+    cudaCheck(cudaSetDevice(device));
+    SystemOptions system{};
+    system.device = device;
+    system.DLACore = DLACore;
+    if (!setUpInference(*iEnv, iOptions, system))
+    {
+        sample::gLogError << "Inference set up failed" << std::endl;
+    }
+}
+namespace
+{
+
+#if defined(__QNX__)
+using TimePoint = double;
+#else
+using TimePoint = std::chrono::time_point<std::chrono::high_resolution_clock>;
+#endif
+
+TimePoint getCurrentTime()
+{
+#if defined(__QNX__)
+    uint64_t const currentCycles = ClockCycles();
+    uint64_t const cyclesPerSecond = SYSPAGE_ENTRY(qtime)->cycles_per_sec;
+    // Return current timestamp in ms.
+    return static_cast<double>(currentCycles) * 1000. / cyclesPerSecond;
+#else
+    return std::chrono::high_resolution_clock::now();
+#endif
+}
+
+//!
+//! \struct SyncStruct
+//! \brief Threads synchronization structure
+//!
+struct SyncStruct
+{
+    std::mutex mutex;
+    TrtCudaStream mainStream;
+    TrtCudaEvent gpuStart{cudaEventBlockingSync};
+    TimePoint cpuStart{};
+    float sleep{};
+};
+
+struct Enqueue
+{
+    explicit Enqueue(nvinfer1::IExecutionContext& context)
+        : mContext(context)
+    {
+    }
+
+    nvinfer1::IExecutionContext& mContext;
+};
+
+//!
+//! \class EnqueueExplicit
+//! \brief Functor to enqueue inference with explicit batch
+//!
+class EnqueueExplicit : private Enqueue
+{
+
+public:
+    explicit EnqueueExplicit(nvinfer1::IExecutionContext& context, Bindings const& bindings)
+        : Enqueue(context)
+        , mBindings(bindings)
+    {
+        ASSERT(mBindings.setTensorAddresses(mContext));
+    }
+
+    bool operator()(TrtCudaStream& stream) const
+    {
+        try
+        {
+            bool const result = mContext.enqueueV3(stream.get());
+            // Collecting layer timing info from current profile index of execution context, except under capturing
+            // mode.
+            if (!isStreamCapturing(stream) && mContext.getProfiler() && !mContext.getEnqueueEmitsProfile()
+                && !mContext.reportToProfiler())
+            {
+                gLogWarning << "Failed to collect layer timing info from previous enqueueV3()" << std::endl;
+            }
+            return result;
+        }
+        catch (const std::exception&)
+        {
+            return false;
+        }
+        return false;
+    }
+
+private:
+    // Helper function to check if a stream is in capturing mode.
+    bool isStreamCapturing(TrtCudaStream& stream) const
+    {
+        cudaStreamCaptureStatus status{cudaStreamCaptureStatusNone};
+        cudaCheck(cudaStreamIsCapturing(stream.get(), &status));
+        return status != cudaStreamCaptureStatusNone;
+    }
+
+    Bindings const& mBindings;
+};
+
+//!
+//! \class EnqueueGraph
+//! \brief Functor to enqueue inference from CUDA Graph
+//!
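+//! Assuming a TrtCudaGraph previously captured around enqueueV3() (see
+//! Iteration::createEnqueueFunction below), a launch replays the recorded work with a
+//! single graph launch, avoiding per-enqueue CPU launch overhead.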
+class EnqueueGraph +{ + +public: + explicit EnqueueGraph(nvinfer1::IExecutionContext& context, TrtCudaGraph& graph) + : mGraph(graph) + , mContext(context) + { + } + + bool operator()(TrtCudaStream& stream) const + { + if (mGraph.launch(stream)) + { + // Collecting layer timing info from current profile index of execution context + if (mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() && !mContext.reportToProfiler()) + { + gLogWarning << "Failed to collect layer timing info from previous CUDA graph launch" << std::endl; + } + return true; + } + return false; + } + + TrtCudaGraph& mGraph; + nvinfer1::IExecutionContext& mContext; +}; + +//! +//! \class EnqueueGraphSafe +//! \brief Functor to enqueue inference from CUDA Graph +//! +class EnqueueGraphSafe +{ + +public: + explicit EnqueueGraphSafe(TrtCudaGraph& graph) + : mGraph(graph) + { + } + + bool operator()(TrtCudaStream& stream) const + { + return mGraph.launch(stream); + } + + TrtCudaGraph& mGraph; +}; + +using EnqueueFunction = std::function; + +enum class StreamType : int32_t +{ + kINPUT = 0, + kCOMPUTE = 1, + kOUTPUT = 2, + kNUM = 3 +}; + +enum class EventType : int32_t +{ + kINPUT_S = 0, + kINPUT_E = 1, + kCOMPUTE_S = 2, + kCOMPUTE_E = 3, + kOUTPUT_S = 4, + kOUTPUT_E = 5, + kNUM = 6 +}; + +using MultiStream = std::array(StreamType::kNUM)>; + +using MultiEvent = std::array, static_cast(EventType::kNUM)>; + +using EnqueueTimes = std::array; + +//! +//! \class Iteration +//! \brief Inference iteration and streams management +//! +class Iteration +{ + +public: + Iteration(int32_t id, InferenceOptions const& inference, nvinfer1::IExecutionContext& context, Bindings& bindings) + : mBindings(bindings) + , mStreamId(id) + , mDepth(1 + inference.overlap) + , mActive(mDepth) + , mEvents(mDepth) + , mEnqueueTimes(mDepth) + , mContext(&context) + { + for (int32_t d = 0; d < mDepth; ++d) + { + for (int32_t e = 0; e < static_cast(EventType::kNUM); ++e) + { + mEvents[d][e].reset(new TrtCudaEvent(!inference.spin)); + } + } + createEnqueueFunction(inference, context, bindings); + } + + bool query(bool skipTransfers) + { + if (mActive[mNext]) + { + return true; + } + + if (!skipTransfers) + { + record(EventType::kINPUT_S, StreamType::kINPUT); + setInputData(false); + record(EventType::kINPUT_E, StreamType::kINPUT); + wait(EventType::kINPUT_E, StreamType::kCOMPUTE); // Wait for input DMA before compute + } + + record(EventType::kCOMPUTE_S, StreamType::kCOMPUTE); + recordEnqueueTime(); + if (!mEnqueue(getStream(StreamType::kCOMPUTE))) + { + return false; + } + recordEnqueueTime(); + record(EventType::kCOMPUTE_E, StreamType::kCOMPUTE); + + if (!skipTransfers) + { + wait(EventType::kCOMPUTE_E, StreamType::kOUTPUT); // Wait for compute before output DMA + record(EventType::kOUTPUT_S, StreamType::kOUTPUT); + fetchOutputData(false); + record(EventType::kOUTPUT_E, StreamType::kOUTPUT); + } + + mActive[mNext] = true; + moveNext(); + return true; + } + + float sync( + TimePoint const& cpuStart, TrtCudaEvent const& gpuStart, std::vector& trace, bool skipTransfers) + { + if (mActive[mNext]) + { + if (skipTransfers) + { + getEvent(EventType::kCOMPUTE_E).synchronize(); + } + else + { + getEvent(EventType::kOUTPUT_E).synchronize(); + } + trace.emplace_back(getTrace(cpuStart, gpuStart, skipTransfers)); + mActive[mNext] = false; + return getEvent(EventType::kCOMPUTE_S) - gpuStart; + } + return 0; + } + + void syncAll( + TimePoint const& cpuStart, TrtCudaEvent const& gpuStart, std::vector& trace, bool skipTransfers) + { + for (int32_t d = 0; d < mDepth; 
++d) + { + sync(cpuStart, gpuStart, trace, skipTransfers); + moveNext(); + } + } + + void wait(TrtCudaEvent& gpuStart) + { + getStream(StreamType::kINPUT).wait(gpuStart); + } + + void setInputData(bool sync) + { + mBindings.transferInputToDevice(getStream(StreamType::kINPUT)); + // additional sync to avoid overlapping with inference execution. + if (sync) + { + getStream(StreamType::kINPUT).synchronize(); + } + } + + void fetchOutputData(bool sync) + { + mBindings.transferOutputToHost(getStream(StreamType::kOUTPUT)); + // additional sync to avoid overlapping with inference execution. + if (sync) + { + getStream(StreamType::kOUTPUT).synchronize(); + } + } + +private: + void moveNext() + { + mNext = mDepth - 1 - mNext; + } + + TrtCudaStream& getStream(StreamType t) + { + return mStream[static_cast(t)]; + } + + TrtCudaEvent& getEvent(EventType t) + { + return *mEvents[mNext][static_cast(t)]; + } + + void record(EventType e, StreamType s) + { + getEvent(e).record(getStream(s)); + } + + void recordEnqueueTime() + { + mEnqueueTimes[mNext][enqueueStart] = getCurrentTime(); + enqueueStart = 1 - enqueueStart; + } + + TimePoint getEnqueueTime(bool start) + { + return mEnqueueTimes[mNext][start ? 0 : 1]; + } + + void wait(EventType e, StreamType s) + { + getStream(s).wait(getEvent(e)); + } + + InferenceTrace getTrace(TimePoint const& cpuStart, TrtCudaEvent const& gpuStart, bool skipTransfers) + { + float is + = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart : getEvent(EventType::kINPUT_S) - gpuStart; + float ie + = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart : getEvent(EventType::kINPUT_E) - gpuStart; + float os + = skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart : getEvent(EventType::kOUTPUT_S) - gpuStart; + float oe + = skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart : getEvent(EventType::kOUTPUT_E) - gpuStart; + + return InferenceTrace(mStreamId, + std::chrono::duration(getEnqueueTime(true) - cpuStart).count(), + std::chrono::duration(getEnqueueTime(false) - cpuStart).count(), is, ie, + getEvent(EventType::kCOMPUTE_S) - gpuStart, getEvent(EventType::kCOMPUTE_E) - gpuStart, os, oe); + } + + void createEnqueueFunction( + InferenceOptions const& inference, nvinfer1::IExecutionContext& context, Bindings& bindings) + { + mEnqueue = EnqueueFunction(EnqueueExplicit(context, mBindings)); + if (inference.graph) + { + sample::gLogInfo << "Capturing CUDA graph for the current execution context" << std::endl; + + TrtCudaStream& stream = getStream(StreamType::kCOMPUTE); + // Avoid capturing initialization calls by executing the enqueue function at least + // once before starting CUDA graph capture. + auto const ret = mEnqueue(stream); + if (!ret) + { + throw std::runtime_error("Inference enqueue failed."); + } + stream.synchronize(); + + mGraph.beginCapture(stream); + // The built TRT engine may contain operations that are not permitted under CUDA graph capture mode. + // When the stream is capturing, the enqueue call may return false if the current CUDA graph capture fails. + if (mEnqueue(stream)) + { + mGraph.endCapture(stream); + mEnqueue = EnqueueFunction(EnqueueGraph(context, mGraph)); + sample::gLogInfo << "Successfully captured CUDA graph for the current execution context" << std::endl; + } + else + { + mGraph.endCaptureOnError(stream); + // Ensure any CUDA error has been cleaned up. + cudaCheck(cudaGetLastError()); + sample::gLogWarning << "The built TensorRT engine contains operations that are not permitted under " + "CUDA graph capture mode." 
+ << std::endl; + sample::gLogWarning << "The specified --useCudaGraph flag has been ignored. The inference will be " + "launched without using CUDA graph launch." + << std::endl; + } + } + } + + Bindings& mBindings; + + TrtCudaGraph mGraph; + EnqueueFunction mEnqueue; + + int32_t mStreamId{0}; + int32_t mNext{0}; + int32_t mDepth{2}; // default to double buffer to hide DMA transfers + + std::vector mActive; + MultiStream mStream; + std::vector mEvents; + + int32_t enqueueStart{0}; + std::vector mEnqueueTimes; + nvinfer1::IExecutionContext* mContext{nullptr}; +}; + +bool inferenceLoop(std::vector>& iStreams, TimePoint const& cpuStart, + TrtCudaEvent const& gpuStart, int iterations, float maxDurationMs, float warmupMs, + std::vector& trace, bool skipTransfers, float idleMs) +{ + float durationMs = 0; + int32_t skip = 0; + + if (maxDurationMs == -1.F) + { + sample::gLogWarning << "--duration=-1 is specified, inference will run in an endless loop until" + << " aborted with CTRL-C (SIGINT)" << std::endl; + while (true) + { + for (auto& s : iStreams) + { + if (!s->query(skipTransfers)) + { + return false; + } + } + for (auto& s : iStreams) + { + s->sync(cpuStart, gpuStart, trace, skipTransfers); + } + } + } + + for (int32_t i = 0; i < iterations + skip || durationMs < maxDurationMs; ++i) + { + for (auto& s : iStreams) + { + if (!s->query(skipTransfers)) + { + return false; + } + } + for (auto& s : iStreams) + { + durationMs = std::max(durationMs, s->sync(cpuStart, gpuStart, trace, skipTransfers)); + } + if (durationMs < warmupMs) // Warming up + { + if (durationMs) // Skip complete iterations + { + ++skip; + } + continue; + } + if (idleMs != 0.F) + { + std::this_thread::sleep_for(std::chrono::duration(idleMs)); + } + } + for (auto& s : iStreams) + { + s->syncAll(cpuStart, gpuStart, trace, skipTransfers); + } + return true; +} + +void inferenceExecution(InferenceOptions const& inference, InferenceEnvironment& iEnv, SyncStruct& sync, + int32_t const threadIdx, int32_t const streamsPerThread, int32_t device, + std::vector& trace) noexcept +{ + try + { + float warmupMs = inference.warmup; + float durationMs = -1.F; + if (inference.duration != -1.F) + { + durationMs = inference.duration * 1000.F + warmupMs; + } + + cudaCheck(cudaSetDevice(device)); + + std::vector> iStreams; + + for (int32_t s = 0; s < streamsPerThread; ++s) + { + int32_t const streamId{threadIdx * streamsPerThread + s}; + auto* iteration = new Iteration(streamId, inference, *iEnv.getContext(streamId), *iEnv.bindings[streamId]); + if (inference.skipTransfers) + { + iteration->setInputData(true); + } + iStreams.emplace_back(iteration); + } + + for (auto& s : iStreams) + { + s->wait(sync.gpuStart); + } + + std::vector localTrace; + if (!inferenceLoop(iStreams, sync.cpuStart, sync.gpuStart, inference.iterations, durationMs, warmupMs, + localTrace, inference.skipTransfers, inference.idle)) + { + sync.mutex.lock(); + iEnv.error = true; + sync.mutex.unlock(); + } + + if (inference.skipTransfers) + { + for (auto& s : iStreams) + { + s->fetchOutputData(true); + } + } + + sync.mutex.lock(); + trace.insert(trace.end(), localTrace.begin(), localTrace.end()); + sync.mutex.unlock(); + } + catch (...) 
+ { + sync.mutex.lock(); + iEnv.error = true; + sync.mutex.unlock(); + } +} + +inline std::thread makeThread(InferenceOptions const& inference, InferenceEnvironment& iEnv, SyncStruct& sync, + int32_t threadIdx, int32_t streamsPerThread, int32_t device, std::vector& trace) +{ + return std::thread(inferenceExecution, std::cref(inference), std::ref(iEnv), std::ref(sync), threadIdx, + streamsPerThread, device, std::ref(trace)); +} + +} // namespace + +bool runInference( + InferenceOptions const& inference, InferenceEnvironment& iEnv, int32_t device, std::vector& trace) +{ + SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError); + cudaCheck(cudaProfilerStart()); + + trace.resize(0); + + SyncStruct sync; + sync.sleep = inference.sleep; + sync.mainStream.sleep(&sync.sleep); + sync.cpuStart = getCurrentTime(); + sync.gpuStart.record(sync.mainStream); + + // When multiple streams are used, trtexec can run inference in two modes: + // (1) if inference.threads is true, then run each stream on each thread. + // (2) if inference.threads is false, then run all streams on the same thread. + int32_t const numThreads = inference.threads ? inference.infStreams : 1; + int32_t const streamsPerThread = inference.threads ? 1 : inference.infStreams; + + std::vector threads; + for (int32_t threadIdx = 0; threadIdx < numThreads; ++threadIdx) + { + threads.emplace_back(makeThread(inference, iEnv, sync, threadIdx, streamsPerThread, device, trace)); + } + for (auto& th : threads) + { + th.join(); + } + + cudaCheck(cudaProfilerStop()); + + auto cmpTrace = [](InferenceTrace const& a, InferenceTrace const& b) { return a.h2dStart < b.h2dStart; }; + std::sort(trace.begin(), trace.end(), cmpTrace); + + return !iEnv.error; +} + +bool runMultiTasksInference(std::vector>& tEnvList) +{ + cudaCheck(cudaProfilerStart()); + cudaSetDeviceFlags(cudaDeviceScheduleSpin); + + SyncStruct sync; + sync.sleep = 0; + sync.mainStream.sleep(&sync.sleep); + sync.cpuStart = getCurrentTime(); + sync.gpuStart.record(sync.mainStream); + + std::vector threads; + for (size_t i = 0; i < tEnvList.size(); ++i) + { + auto& tEnv = tEnvList[i]; + threads.emplace_back(makeThread( + tEnv->iOptions, *(tEnv->iEnv), sync, /*threadIdx*/ 0, /*streamsPerThread*/ 1, tEnv->device, tEnv->trace)); + } + for (auto& th : threads) + { + th.join(); + } + + cudaCheck(cudaProfilerStop()); + + auto cmpTrace = [](InferenceTrace const& a, InferenceTrace const& b) { return a.h2dStart < b.h2dStart; }; + for (auto& tEnv : tEnvList) + { + std::sort(tEnv->trace.begin(), tEnv->trace.end(), cmpTrace); + } + + return std::none_of(tEnvList.begin(), tEnvList.end(), + [](std::unique_ptr& tEnv) { return tEnv->iEnv->error; }); +} + +namespace +{ +size_t reportGpuMemory() +{ + static size_t prevFree{0}; + size_t free{0}; + size_t total{0}; + size_t newlyAllocated{0}; + cudaCheck(cudaMemGetInfo(&free, &total)); + sample::gLogInfo << "Free GPU memory = " << free / 1024.0_MiB << " GiB"; + if (prevFree != 0) + { + newlyAllocated = (prevFree - free); + sample::gLogInfo << ", newly allocated GPU memory = " << newlyAllocated / 1024.0_MiB << " GiB"; + } + sample::gLogInfo << ", total GPU memory = " << total / 1024.0_MiB << " GiB" << std::endl; + prevFree = free; + return newlyAllocated; +} +} // namespace + +//! Returns true if deserialization is slower than expected or fails. 
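+//! Method (sketch): after two warmup runs, the first timed deserialization is compared
+//! against the mean of kNB_ITERS further runs; exceeding the 2x tolerance usually points
+//! at cold-cache effects rather than a real regression, and is reported as a failure.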
+bool timeDeserialize(InferenceEnvironment& iEnv, SystemOptions const& sys) +{ + constexpr int32_t kNB_ITERS{20}; + std::unique_ptr rt{createRuntime()}; + std::unique_ptr engine; + + SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError); + + auto timeDeserializeFn = [&]() -> float { + bool deserializeOK{false}; + engine.reset(nullptr); + auto startClock = std::chrono::high_resolution_clock::now(); + SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError); + + auto& reader = iEnv.engine.getFileReader(); + reader.reset(); + ASSERT(reader.isOpen()); +#if !TRT_WINML + for (auto const& pluginPath : sys.dynamicPlugins) + { + rt->getPluginRegistry().loadLibrary(pluginPath.c_str()); + } +#endif + engine.reset(rt->deserializeCudaEngine(reader)); + deserializeOK = (engine != nullptr); + auto endClock = std::chrono::high_resolution_clock::now(); + // return NAN if deserialization failed. + return deserializeOK ? std::chrono::duration(endClock - startClock).count() : NAN; + }; + + // Warmup the caches to make sure that cache thrashing isn't throwing off the results + { + sample::gLogInfo << "Begin deserialization warmup..." << std::endl; + for (int32_t i = 0, e = 2; i < e; ++i) + { + timeDeserializeFn(); + } + } + sample::gLogInfo << "Begin deserialization engine timing..." << std::endl; + float const first = timeDeserializeFn(); + + // Check if first deserialization succeeded. + if (std::isnan(first)) + { + sample::gLogError << "Engine deserialization failed." << std::endl; + return true; + } + + sample::gLogInfo << "First deserialization time = " << first << " milliseconds" << std::endl; + + // Record initial gpu memory state. + reportGpuMemory(); + + float totalTime{0.F}; + for (int32_t i = 0; i < kNB_ITERS; ++i) + { + totalTime += timeDeserializeFn(); + } + auto const averageTime = totalTime / kNB_ITERS; + // reportGpuMemory sometimes reports zero after a single deserialization of a small engine, + // so use the size of memory for all the iterations. + auto const totalEngineSizeGpu = reportGpuMemory(); + sample::gLogInfo << "Total deserialization time = " << totalTime << " milliseconds in " << kNB_ITERS + << " iterations, average time = " << averageTime << " milliseconds, first time = " << first + << " milliseconds." << std::endl; + sample::gLogInfo << "Deserialization Bandwidth = " << 1E-6 * totalEngineSizeGpu / totalTime << " GB/s" << std::endl; + + // If the first deserialization is more than tolerance slower than + // the average deserialization, return true, which means an error occurred. + // The tolerance is set to 2x since the deserialization time is quick and susceptible + // to caching issues causing problems in the first timing. + auto const tolerance = 2.0F; + bool const isSlowerThanExpected = first > averageTime * tolerance; + if (isSlowerThanExpected) + { + sample::gLogInfo << "First deserialization time divided by average time is " << (first / averageTime) + << ". Exceeds tolerance of " << tolerance << "x." 
<< std::endl; + } + return isSlowerThanExpected; +} + +std::string getLayerInformation( + nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context, nvinfer1::LayerInformationFormat format) +{ + auto runtime = std::unique_ptr{createRuntime()}; + auto inspector = std::unique_ptr(engine->createEngineInspector()); + if (context != nullptr) + { + inspector->setExecutionContext(context); + } + std::string result = inspector->getEngineInformation(format); + return result; +} + +void Binding::fill(std::string const& fileName) +{ + loadFromFile(fileName, static_cast(buffer->getHostBuffer()), buffer->getSize()); +} + +void Binding::fill() +{ + switch (dataType) + { + case nvinfer1::DataType::kBOOL: + { + fillBuffer(buffer->getHostBuffer(), volume, 0, 1); + break; + } + case nvinfer1::DataType::kINT32: + { + fillBuffer(buffer->getHostBuffer(), volume, -128, 127); + break; + } + case nvinfer1::DataType::kINT64: + { + fillBuffer(buffer->getHostBuffer(), volume, -128, 127); + break; + } + case nvinfer1::DataType::kINT8: + { + fillBuffer(buffer->getHostBuffer(), volume, -128, 127); + break; + } + case nvinfer1::DataType::kFLOAT: + { + fillBuffer(buffer->getHostBuffer(), volume, -1.0F, 1.0F); + break; + } + case nvinfer1::DataType::kHALF: + { + fillBuffer<__half>(buffer->getHostBuffer(), volume, -1.0F, 1.0F); + break; + } + case nvinfer1::DataType::kBF16: + { + fillBuffer(buffer->getHostBuffer(), volume, -1.0F, 1.0F); + break; + } + case nvinfer1::DataType::kUINT8: + { + fillBuffer(buffer->getHostBuffer(), volume, 0, 255); + break; + } + case nvinfer1::DataType::kFP8: ASSERT(false && "FP8 is not supported"); + case nvinfer1::DataType::kINT4: ASSERT(false && "INT4 is not supported"); + } +} + +void Binding::dump(std::ostream& os, Dims dims, Dims strides, int32_t vectorDim, int32_t spv, + std::string const separator /*= " "*/) const +{ + void* outputBuffer{}; + if (outputAllocator != nullptr) + { + outputBuffer = outputAllocator->getBuffer()->getHostBuffer(); + // Overwrite dimensions with those reported by the output allocator. + dims = outputAllocator->getFinalDims(); + os << "Final shape is " << dims << " reported by the output allocator." 
<< std::endl; + } + else + { + outputBuffer = buffer->getHostBuffer(); + } + switch (dataType) + { + case nvinfer1::DataType::kBOOL: + { + dumpBuffer(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kINT32: + { + dumpBuffer(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kINT8: + { + dumpBuffer(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kFLOAT: + { + dumpBuffer(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kHALF: + { + dumpBuffer<__half>(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kBF16: + { + dumpBuffer(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kUINT8: + { + dumpBuffer(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kINT64: + { + dumpBuffer(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kFP8: ASSERT(false && "FP8 is not supported"); + case nvinfer1::DataType::kINT4: ASSERT(false && "INT4 is not supported"); + } +} + +void Bindings::addBinding(TensorInfo const& tensorInfo, std::string const& fileName /*= ""*/) +{ + auto const b = tensorInfo.bindingIndex; + while (mBindings.size() <= static_cast(b)) + { + mBindings.emplace_back(); + mDevicePointers.emplace_back(); + } + mNames[tensorInfo.name] = b; + mBindings[b].isInput = tensorInfo.isInput; + mBindings[b].volume = tensorInfo.vol; + mBindings[b].dataType = tensorInfo.dataType; + if (tensorInfo.isDynamic) + { + ASSERT(!tensorInfo.isInput); // Only output shape can be possibly unknown because of DDS. + if (mBindings[b].outputAllocator == nullptr) + { + if (mUseManaged) + { + mBindings[b].outputAllocator.reset(new OutputAllocator(new UnifiedMirroredBuffer)); + } + else + { + mBindings[b].outputAllocator.reset(new OutputAllocator(new DiscreteMirroredBuffer)); + } + } + } + else + { + if (mBindings[b].buffer == nullptr) + { + if (mUseManaged) + { + mBindings[b].buffer.reset(new UnifiedMirroredBuffer); + } + else + { + mBindings[b].buffer.reset(new DiscreteMirroredBuffer); + } + } + // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr + // even for empty tensors, so allocate a dummy byte. 
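+        // (The 1-byte placeholder keeps getDeviceBuffer() non-null, so a valid device address can
+        // still be handed to TensorRT for zero-volume tensors.)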
+ if (tensorInfo.vol == 0) + { + mBindings[b].buffer->allocate(1); + } + else + { + mBindings[b].buffer->allocate( + static_cast(tensorInfo.vol) * static_cast(dataTypeSize(tensorInfo.dataType))); + } + mDevicePointers[b] = mBindings[b].buffer->getDeviceBuffer(); + } + if (tensorInfo.isInput) + { + if (fileName.empty()) + { + fill(b); + } + else + { + fill(b, fileName); + } + } +} + +void** Bindings::getDeviceBuffers() +{ + return mDevicePointers.data(); +} + +void Bindings::transferInputToDevice(TrtCudaStream& stream) +{ + for (auto& b : mNames) + { + if (mBindings[b.second].isInput) + { + mBindings[b.second].buffer->hostToDevice(stream); + } + } +} + +void Bindings::transferOutputToHost(TrtCudaStream& stream) +{ + for (auto& b : mNames) + { + if (!mBindings[b.second].isInput) + { + if (mBindings[b.second].outputAllocator != nullptr) + { + mBindings[b.second].outputAllocator->getBuffer()->deviceToHost(stream); + } + else + { + mBindings[b.second].buffer->deviceToHost(stream); + } + } + } +} + +void Bindings::dumpBindingValues(nvinfer1::IExecutionContext const& context, int32_t binding, std::ostream& os, + std::string const& separator /*= " "*/, int32_t batch /*= 1*/) const +{ + auto const tensorName = context.getEngine().getIOTensorName(binding); + Dims dims = context.getTensorShape(tensorName); + Dims strides = context.getTensorStrides(tensorName); + int32_t vectorDim = context.getEngine().getTensorVectorizedDim(tensorName); + int32_t const spv = context.getEngine().getTensorComponentsPerElement(tensorName); + + mBindings[binding].dump(os, dims, strides, vectorDim, spv, separator); +} + +namespace +{ + +std::string genFilenameSafeString(std::string const& s) +{ + std::string res = s; + static std::string const allowedSpecialChars{"._-,"}; + for (auto& c : res) + { + if (!isalnum(c) && allowedSpecialChars.find(c) == std::string::npos) + { + c = '_'; + } + } + return res; +} + +Dims getBindingDimensions(nvinfer1::IExecutionContext const& context, std::string const& name) +{ + return context.getTensorShape(name.c_str()); +} +} // namespace + +void Bindings::dumpRawBindingToFiles(nvinfer1::IExecutionContext const& context, std::ostream& os) const +{ + os << "Dumping I/O Bindings to RAW Files:" << std::endl; + for (auto const& n : mNames) + { + auto name = n.first; + auto bIndex = n.second; + auto const& binding = mBindings[bIndex]; + void* outputBuffer{}; + if (binding.outputAllocator != nullptr) + { + outputBuffer = binding.outputAllocator->getBuffer()->getHostBuffer(); + } + else + { + outputBuffer = binding.buffer->getHostBuffer(); + } + + Dims dims = getBindingDimensions(context, name); + std::string dimsStr; + std::string dotStr; + + for (int32_t i = 0; i < dims.nbDims; i++) + { + dimsStr += dotStr + std::to_string(dims.d[i]); + dotStr = "."; + } + + std::string const bindingTypeStr = (binding.isInput ? "input" : "output"); + + std::stringstream fileName; + fileName << genFilenameSafeString(name) << "." << bindingTypeStr << "." << dimsStr << "." 
<< binding.dataType + << ".raw"; + + os << "Writing file for " << bindingTypeStr << " binding " << name << " (with datatype " << binding.dataType + << " and dimensions " << dimsStr << ") to " << fileName.str() << std::endl; + + std::ofstream f(fileName.str(), std::ios::out | std::ios::binary); + ASSERT(f && "Cannot open file for write"); + f.write(static_cast(outputBuffer), binding.volume * samplesCommon::elementSize(binding.dataType)); + f.close(); + } +} + +void Bindings::dumpBindingDimensions( + std::string const& name, nvinfer1::IExecutionContext const& context, std::ostream& os) const +{ + auto const dims = context.getTensorShape(name.c_str()); + // Do not add a newline terminator, because the caller may be outputting a JSON string. + os << dims; +} + +std::unordered_map Bindings::getBindings(std::function predicate) const +{ + std::unordered_map bindings; + for (auto const& n : mNames) + { + auto const binding = n.second; + if (predicate(mBindings[binding])) + { + bindings.insert(n); + } + } + return bindings; +} + +bool Bindings::setTensorAddresses(nvinfer1::IExecutionContext& context) const +{ + for (auto const& b : mNames) + { + auto const name = b.first.c_str(); + auto const location = context.getEngine().getTensorLocation(name); + if (location == TensorLocation::kDEVICE) + { + if (mBindings[b.second].outputAllocator != nullptr) + { + if (!context.setOutputAllocator(name, mBindings[b.second].outputAllocator.get())) + { + return false; + } + } + else + { + if (!context.setTensorAddress(name, mDevicePointers[b.second])) + { + return false; + } + } + } + } + return true; +} + +bool DebugTensorWriter::processDebugTensor(void const* addr, nvinfer1::TensorLocation location, nvinfer1::DataType type, + nvinfer1::Dims const& shape, char const* name, cudaStream_t stream) +{ + CHECK(cudaStreamSynchronize(stream)); + // Store data from callback. + int64_t size = std::accumulate(shape.d, shape.d + shape.nbDims, 1LL, std::multiplies{}) + * samplesCommon::elementSize(type); + std::vector hostDataOut(size, 0); + CHECK(cudaMemcpy(hostDataOut.data(), addr, size, cudaMemcpyDeviceToHost)); + + auto it = mDebugTensorFileNames.find(name); + ASSERT(it != mDebugTensorFileNames.end()); + std::string fileName = it->second; + + std::ofstream f(fileName, std::ios::out | std::ios::binary); + ASSERT(f && "Cannot open file for write"); + sample::gLogInfo << "Writing to file " << fileName << " for debug tensor " << name << std::endl; + f.write(hostDataOut.data(), size); + f.close(); + + CHECK(cudaStreamSynchronize(stream)); + return true; +} + +} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleInference.h b/src/Detector/tensorrt_yolo/common/sampleInference.h index 1c21f592..d9ebed92 100644 --- a/src/Detector/tensorrt_yolo/common/sampleInference.h +++ b/src/Detector/tensorrt_yolo/common/sampleInference.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -17,76 +18,243 @@ #ifndef TRT_SAMPLE_INFERENCE_H #define TRT_SAMPLE_INFERENCE_H +#include "sampleDevice.h" +#include "sampleEngines.h" #include "sampleReporting.h" #include "sampleUtils.h" +#include #include +#include #include #include #include -#include "NvInfer.h" +namespace sample +{ -#if (NV_TENSORRT_MAJOR > 7) +// IDebugListener class for writing debug tensors to output file. +class DebugTensorWriter : public nvinfer1::IDebugListener +{ +public: + DebugTensorWriter(std::unordered_map fileNames) + : mDebugTensorFileNames(fileNames) + { + } -#include "NvInferSafeRuntime.h" + bool processDebugTensor(void const* addr, nvinfer1::TensorLocation location, nvinfer1::DataType type, + nvinfer1::Dims const& shape, char const* name, cudaStream_t stream) override; -namespace sample -{ +private: + std::unordered_map mDebugTensorFileNames; +}; struct InferenceEnvironment { - TrtUniquePtr engine; + InferenceEnvironment() = delete; + InferenceEnvironment(InferenceEnvironment const& other) = delete; + InferenceEnvironment(InferenceEnvironment&& other) = delete; + InferenceEnvironment(BuildEnvironment& bEnv) : engine(std::move(bEnv.engine)), safe(bEnv.engine.isSafe()) + { + } + + LazilyDeserializedEngine engine; std::unique_ptr profiler; - std::vector> context; + std::vector> contexts; + std::vector + deviceMemory; //< Device memory used for inference when the allocation strategy is not static. std::vector> bindings; + std::unique_ptr listener; bool error{false}; - std::vector engineBlob; - bool safe{false}; - std::unique_ptr safeEngine; - std::vector> safeContext; - template - inline ContextType* getContext(int32_t streamIdx); + inline nvinfer1::IExecutionContext* getContext(int32_t streamIdx); + + //! Storage for input shape tensors. + //! + //! It's important that the addresses of the data do not change between the calls to + //! setTensorAddress/setInputShape (which tells TensorRT where the input shape tensor is) + //! and enqueueV3 (when TensorRT might use the input shape tensor). + //! + //! The input shape tensors could alternatively be handled via member bindings, + //! but it simplifies control-flow to store the data here since it's shared across + //! the bindings. + std::list> inputShapeTensorValues; }; -template <> inline nvinfer1::IExecutionContext* InferenceEnvironment::getContext(int32_t streamIdx) { - return context[streamIdx].get(); -} - -template <> -inline nvinfer1::safe::IExecutionContext* InferenceEnvironment::getContext(int32_t streamIdx) -{ - return safeContext[streamIdx].get(); + return contexts[streamIdx].get(); } //! //! \brief Set up contexts and bindings for inference //! -bool setUpInference(InferenceEnvironment& iEnv, const InferenceOptions& inference); +bool setUpInference(InferenceEnvironment& iEnv, InferenceOptions const& inference, SystemOptions const& system); //! //! \brief Deserialize the engine and time how long it takes. //! -bool timeDeserialize(InferenceEnvironment& iEnv); +bool timeDeserialize(InferenceEnvironment& iEnv, SystemOptions const& sys); //! //! \brief Run inference and collect timing, return false if any error hit during inference //! 
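+//! When multiple streams are used, InferenceOptions::threads selects whether each stream runs on
+//! its own thread or all streams are enqueued from a single thread.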
bool runInference( - const InferenceOptions& inference, InferenceEnvironment& iEnv, int32_t device, std::vector& trace); + InferenceOptions const& inference, InferenceEnvironment& iEnv, int32_t device, std::vector& trace); //! //! \brief Get layer information of the engine. //! -std::string getLayerInformation(const InferenceEnvironment& iEnv, nvinfer1::LayerInformationFormat format); +std::string getLayerInformation( + nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context, nvinfer1::LayerInformationFormat format); -} // namespace sample +struct Binding +{ + bool isInput{false}; + std::unique_ptr buffer; + std::unique_ptr outputAllocator; + int64_t volume{0}; + nvinfer1::DataType dataType{nvinfer1::DataType::kFLOAT}; + + void fill(std::string const& fileName); + + void fill(); + + void dump(std::ostream& os, nvinfer1::Dims dims, nvinfer1::Dims strides, int32_t vectorDim, int32_t spv, + std::string const separator = " ") const; +}; + +struct TensorInfo +{ + int32_t bindingIndex{-1}; + char const* name{nullptr}; + nvinfer1::Dims dims{}; + bool isDynamic{}; + int32_t comps{-1}; + nvinfer1::Dims strides{}; + int32_t vectorDimIndex{-1}; + bool isInput{}; + nvinfer1::DataType dataType{}; + int64_t vol{-1}; + + void updateVolume(int32_t batch) + { + vol = volume(dims, strides, vectorDimIndex, comps, batch); + } +}; + +class Bindings +{ +public: + Bindings() = delete; + explicit Bindings(bool useManaged) + : mUseManaged(useManaged) + { + } + + void addBinding(TensorInfo const& tensorInfo, std::string const& fileName = ""); -#endif + void** getDeviceBuffers(); + + void transferInputToDevice(TrtCudaStream& stream); + + void transferOutputToHost(TrtCudaStream& stream); + + void fill(int binding, std::string const& fileName) + { + mBindings[binding].fill(fileName); + } + + void fill(int binding) + { + mBindings[binding].fill(); + } + + void dumpBindingDimensions( + std::string const& name, nvinfer1::IExecutionContext const& context, std::ostream& os) const; + + void dumpBindingValues(nvinfer1::IExecutionContext const& context, int32_t binding, std::ostream& os, + std::string const& separator = " ", int32_t batch = 1) const; + + void dumpRawBindingToFiles(nvinfer1::IExecutionContext const& context, std::ostream& os) const; + + void dumpInputs(nvinfer1::IExecutionContext const& context, std::ostream& os) const + { + auto isInput = [](Binding const& b) { return b.isInput; }; + dumpBindings(context, isInput, os); + } + + void dumpOutputs(nvinfer1::IExecutionContext const& context, std::ostream& os) const; + + void dumpBindings(nvinfer1::IExecutionContext const& context, std::ostream& os) const + { + auto all = [](Binding const& b) { return true; }; + dumpBindings(context, all, os); + } + + void dumpBindings(nvinfer1::IExecutionContext const& context, std::function predicate, + std::ostream& os) const + { + for (auto const& n : mNames) + { + auto const name = n.first; + auto const binding = n.second; + if (predicate(mBindings[binding])) + { + os << n.first << ": ("; + dumpBindingDimensions(name, context, os); + os << ")" << std::endl; + + dumpBindingValues(context, binding, os); + os << std::endl; + } + } + } + + std::unordered_map getInputBindings() const + { + auto isInput = [](Binding const& b) { return b.isInput; }; + return getBindings(isInput); + } + + std::unordered_map getOutputBindings() const + { + auto isOutput = [](Binding const& b) { return !b.isInput; }; + return getBindings(isOutput); + } + + std::unordered_map getBindings() const + { + auto all = [](Binding const& b) { return 
true; }; + return getBindings(all); + } + + std::unordered_map getBindings(std::function predicate) const; + + bool setTensorAddresses(nvinfer1::IExecutionContext& context) const; + +private: + std::unordered_map mNames; + std::vector mBindings; + std::vector mDevicePointers; + bool mUseManaged{false}; +}; + +struct TaskInferenceEnvironment +{ + TaskInferenceEnvironment(std::string engineFile, InferenceOptions inference, int32_t deviceId = 0, + int32_t DLACore = -1, int32_t bs = batchNotProvided); + InferenceOptions iOptions{}; + int32_t device{defaultDevice}; + int32_t batch{batchNotProvided}; + std::unique_ptr iEnv; + std::vector trace; +}; + +bool runMultiTasksInference(std::vector>& tEnvList); + +} // namespace sample #endif // TRT_SAMPLE_INFERENCE_H diff --git a/src/Detector/tensorrt_yolo/common/sampleOptions.cpp b/src/Detector/tensorrt_yolo/common/sampleOptions.cpp index 0afd163f..bdb1b21c 100644 --- a/src/Detector/tensorrt_yolo/common/sampleOptions.cpp +++ b/src/Detector/tensorrt_yolo/common/sampleOptions.cpp @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -19,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -27,29 +29,64 @@ #include "logger.h" #include "sampleOptions.h" - +#include "sampleUtils.h" +using namespace nvinfer1; namespace sample { namespace { -std::vector splitToStringVec(const std::string& option, char separator) +static const std::map> kUNIT_MULTIPLIERS{ + {'B', {1, "Bytes"}}, + {'K', {1 << 10, "Kibibytes"}}, + {'M', {1 << 20, "Mebibytes"}}, + {'G', {1 << 30, "Gibibytes"}}, +}; + +std::string addDefaultUnitSuffixIfNotSpecified(std::string const& option, char defaultUnit) { - std::vector options; + char lastChar = option.at(option.size() - 1); + return std::isdigit(lastChar) ? option + defaultUnit : option; +} - for (size_t start = 0; start < option.length();) +// Returns "B (Bytes), K (Kilobytes), ..." +std::string getAvailableUnitSuffixes() +{ + std::ostringstream ss; + for (auto it = kUNIT_MULTIPLIERS.begin(); it != kUNIT_MULTIPLIERS.end(); ++it) { - size_t separatorIndex = option.find(separator, start); - if (separatorIndex == std::string::npos) + if (it != kUNIT_MULTIPLIERS.begin()) { - separatorIndex = option.length(); + ss << ", "; } - options.emplace_back(option.substr(start, separatorIndex - start)); - start = separatorIndex + 1; + ss << it->first << " (" << it->second.second << ")"; } + return ss.str(); +} - return options; +// Numeric trtexec arguments can have unit specifiers in similar to polygraphy. +// E.g. --weightStreamingBudget=20M would be 20 Mebibytes (base 2). 
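+// A bare number is interpreted as bytes, so "4096" and "4K" denote the same size.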
+int64_t getUnitMultiplier(std::string const& option) +{ + char lastChar = option.at(option.size() - 1); + if (!std::isdigit(lastChar)) + { + char unit = std::toupper(lastChar); + auto found = kUNIT_MULTIPLIERS.find(unit); + if (found == kUNIT_MULTIPLIERS.end()) + { + std::ostringstream ss; + ss << "Error parsing \"" << option << "\": invalid unit specifier '" << unit + << "'. Valid base-2 unit suffixes include: "; + ss << getAvailableUnitSuffixes() << "."; + throw std::invalid_argument(ss.str()); + } + return found->second.first; + } + + // Return bytes by default + return kUNIT_MULTIPLIERS.at('B').first; } template @@ -64,6 +101,12 @@ int32_t stringToValue(const std::string& option) return std::stoi(option); } +template <> +size_t stringToValue(const std::string& option) +{ + return std::stoi(option) * getUnitMultiplier(option); +} + template <> float stringToValue(const std::string& option) { @@ -73,7 +116,7 @@ float stringToValue(const std::string& option) template <> double stringToValue(const std::string& option) { - return std::stod(option); + return std::stod(option) * getUnitMultiplier(option); } template <> @@ -86,6 +129,10 @@ template <> std::vector stringToValue>(const std::string& option) { std::vector shape; + if (option == "scalar") + { + return shape; + } std::vector dimsStrings = splitToStringVec(option, 'x'); for (const auto& d : dimsStrings) { @@ -98,8 +145,9 @@ template <> nvinfer1::DataType stringToValue(const std::string& option) { const std::unordered_map strToDT{{"fp32", nvinfer1::DataType::kFLOAT}, - {"fp16", nvinfer1::DataType::kHALF}, {"int8", nvinfer1::DataType::kINT8}, - {"int32", nvinfer1::DataType::kINT32}}; + {"fp16", nvinfer1::DataType::kHALF}, {"bf16", nvinfer1::DataType::kBF16}, {"int8", nvinfer1::DataType::kINT8}, + {"fp8", nvinfer1::DataType::kFP8}, {"int32", nvinfer1::DataType::kINT32}, {"int64", nvinfer1::DataType::kINT64}, + {"bool", nvinfer1::DataType::kBOOL}, {"uint8", nvinfer1::DataType::kUINT8}, {"int4", nvinfer1::DataType::kINT4}}; const auto& dt = strToDT.find(option); if (dt == strToDT.end()) { @@ -108,6 +156,21 @@ nvinfer1::DataType stringToValue(const std::string& option) return dt->second; } +template <> +nvinfer1::DeviceType stringToValue(std::string const& option) +{ + std::unordered_map const strToDevice = { + {"GPU", nvinfer1::DeviceType::kGPU}, + {"DLA", nvinfer1::DeviceType::kDLA}, + }; + auto const& device = strToDevice.find(option); + if (device == strToDevice.end()) + { + throw std::invalid_argument("Invalid Device Type " + option); + } + return device->second; +} + template <> nvinfer1::TensorFormats stringToValue(const std::string& option) { @@ -116,7 +179,8 @@ nvinfer1::TensorFormats stringToValue(const std::string {"chw2", nvinfer1::TensorFormat::kCHW2}, {"chw4", nvinfer1::TensorFormat::kCHW4}, {"hwc8", nvinfer1::TensorFormat::kHWC8}, {"chw16", nvinfer1::TensorFormat::kCHW16}, {"chw32", nvinfer1::TensorFormat::kCHW32}, {"dhwc8", nvinfer1::TensorFormat::kDHWC8}, - {"hwc", nvinfer1::TensorFormat::kHWC}, {"dla_linear", nvinfer1::TensorFormat::kDLA_LINEAR}, + {"cdhw32", nvinfer1::TensorFormat::kCDHW32}, {"hwc", nvinfer1::TensorFormat::kHWC}, + {"dhwc", nvinfer1::TensorFormat::kDHWC}, {"dla_linear", nvinfer1::TensorFormat::kDLA_LINEAR}, {"dla_hwc4", nvinfer1::TensorFormat::kDLA_HWC4}}; nvinfer1::TensorFormats formats{}; for (auto f : optionStrings) @@ -149,11 +213,82 @@ IOFormat stringToValue(const std::string& option) return ioFormat; } +template <> +SparsityFlag stringToValue(std::string const& option) +{ + std::unordered_map 
const table{ + {"disable", SparsityFlag::kDISABLE}, {"enable", SparsityFlag::kENABLE}, {"force", SparsityFlag::kFORCE}}; + auto search = table.find(option); + if (search == table.end()) + { + throw std::invalid_argument(std::string("Unknown sparsity mode: ") + option); + } + if (search->second == SparsityFlag::kFORCE) + { + sample::gLogWarning << "--sparsity=force has been deprecated. " + << "Please use to rewrite the weights to a sparsity pattern " + << "and then run with --sparsity=enable" << std::endl; + } + + return search->second; +} + +template <> +WeightStreamingBudget stringToValue(std::string const& option) +{ + WeightStreamingBudget budget; + if (option.find('%') != std::string::npos) + { + double percent = std::stod(option); + if (!(percent >= 0 && percent <= 100.0)) + { + std::ostringstream err; + err << "The weight streaming percent must be between 0 and 100."; + throw std::invalid_argument(err.str()); + } + budget.percent = percent; + } + else + { + double bytes = stringToValue(option); + if (!(bytes == WeightStreamingBudget::kAUTOMATIC || bytes == WeightStreamingBudget::kDISABLE || bytes >= 0)) + { + std::ostringstream err; + err << "The weight streaming budget must be " << WeightStreamingBudget::kDISABLE << ", " + << WeightStreamingBudget::kAUTOMATIC << ", or at least 0."; + throw std::invalid_argument(err.str()); + } + budget.bytes = static_cast(bytes); + } + return budget; +} + template std::pair splitNameAndValue(const std::string& s) { std::string tensorName; std::string valueString; + + // Support 'inputName':Path format for --loadInputs flag when dealing with Windows paths. + // i.e. 'inputName':c:\inputData + std::vector quoteNameRange{ splitToStringVec(s, '\'') }; + // splitToStringVec returns the entire string when delimiter is not found, so it's size is always at least 1 + if (quoteNameRange.size() != 1) + { + if (quoteNameRange.size() != 3) + { + std::string errorMsg = std::string("Found invalid number of \'s when parsing ") + s + + std::string(". Expected: 2, received: ") + std::to_string(quoteNameRange.size() -1) + + ". Please ensure that a singular comma is used within each comma-separated key-value pair for options like --inputIOFormats, --optShapes, --optShapesCalib, --layerPrecisions, etc."; + throw std::invalid_argument(errorMsg); + } + // Everything before the second "'" is the name. + tensorName = quoteNameRange[0] + quoteNameRange[1]; + // Path is the last string - ignoring leading ":" so slice it with [1:] + valueString = quoteNameRange[2].substr(1); + return std::pair(tensorName, stringToValue(valueString)); + } + // Split on the last : std::vector nameRange{splitToStringVec(s, ':')}; // Everything before the last : is the name @@ -181,16 +316,71 @@ const char* boolToEnabled(bool enable) return enable ? "Enabled" : "Disabled"; } +//! A helper function similar to sep.join(list) in Python. +template +std::string joinValuesToString(std::vector const& list, std::string const& sep) +{ + std::ostringstream os; + for (int32_t i = 0, n = list.size(); i < n; ++i) + { + os << list[i]; + if (i != n - 1) + { + os << sep; + } + } + return os.str(); +} + +template +std::string joinValuesToString(std::array const& list, std::string const& sep) +{ + return joinValuesToString(std::vector(list.begin(), list.end()), sep); +} + //! Check if input option exists in input arguments. -//! If it does: return its value, erase the argument and return true. +//! If it does: set its value, and return true //! If it does not: return false. 
 template <typename T>
-bool getAndDelOption(Arguments& arguments, const std::string& option, T& value)
+bool getOption(Arguments& arguments, const std::string& option, T& value)
 {
-    const auto match = arguments.find(option);
+    auto const match = arguments.find(option);
     if (match != arguments.end())
     {
-        value = stringToValue<T>(match->second);
+        value = stringToValue<T>(match->second.first);
+        return true;
+    }
+
+    return false;
+}
+
+//! Check if input option exists in input arguments.
+//! If it does: set its value, erase the argument and return true.
+//! If it does not: return false.
+template <typename T_>
+bool getAndDelOption(Arguments& arguments, const std::string& option, T_& value)
+{
+    bool found = getOption(arguments, option, value);
+    if (found)
+    {
+        const auto match = arguments.find(option);
+        arguments.erase(match);
+    }
+
+    return found;
+}
+
+//! Check if input option exists in input arguments.
+//! If it does: set its value and position, erase the argument and return true.
+//! If it does not: return false.
+template <typename T_>
+bool getAndDelOptionWithPosition(Arguments& arguments, std::string const& option, T_& value, int32_t& pos)
+{
+    auto const match = arguments.find(option);
+    if (match != arguments.end())
+    {
+        value = stringToValue<T_>(match->second.first);
+        pos = match->second.second;
         arguments.erase(match);
         return true;
     }
@@ -198,8 +388,31 @@ bool getAndDelOption(Arguments& arguments, const std::string& option, T& value)
 
     return false;
 }
 
+//! Check if input option exists in input arguments behind the position specified by pos.
+//! If it does: set its value, erase the argument and return true.
+//! If it does not: return false.
+template <typename T_>
+bool getAndDelOptionBehind(Arguments& arguments, std::string const& option, int32_t pos, T_& value)
+{
+    auto const match = arguments.equal_range(option);
+    if (match.first == match.second)
+    {
+        return false;
+    }
+    for (auto i = match.first; i != match.second; ++i)
+    {
+        if (i->second.second - pos == 1)
+        {
+            value = stringToValue<T_>(i->second.first);
+            arguments.erase(i);
+            return true;
+        }
+    }
+    return false;
+}
+
 //! Check if input option exists in input arguments.
-//! If it does: return false in value, erase the argument and return true.
+//! If it does: set false in value, erase the argument and return true.
 //! If it does not: return false.
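+//! (A negative option is the "--no"-prefixed form of a flag, e.g. --noTF32, which clears the value.)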
bool getAndDelNegOption(Arguments& arguments, const std::string& option, bool& value) { @@ -224,34 +437,37 @@ bool getAndDelRepeatedOption(Arguments& arguments, const std::string& option, st return false; } - auto addToValues = [&values](Arguments::value_type& argValue) {values.emplace_back(stringToValue(argValue.second));}; + auto addToValues + = [&values](Arguments::value_type& argValue) { values.emplace_back(stringToValue(argValue.second.first)); }; std::for_each(match.first, match.second, addToValues); arguments.erase(match.first, match.second); return true; } -void insertShapesBuild(std::unordered_map& shapes, nvinfer1::OptProfileSelector selector, const std::string& name, const std::vector& dims) +void insertShapesBuild(BuildOptions::ShapeProfile& shapes, nvinfer1::OptProfileSelector selector, + const std::string& name, const std::vector& dims) { shapes[name][static_cast(selector)] = dims; } -void insertShapesInference(std::unordered_map>& shapes, const std::string& name, const std::vector& dims) +void insertShapesInference( + InferenceOptions::ShapeProfile& shapes, std::string const& name, std::vector const& dims) { shapes[name] = dims; } std::string removeSingleQuotationMarks(std::string& str) { - std::vector strList{splitToStringVec(str, '\'')}; - // Remove all the escaped single quotation marks - std::string retVal = ""; - // Do not really care about unterminated sequences - for (size_t i = 0; i < strList.size(); i++) - { - retVal += strList[i]; - } - return retVal; + std::vector strList{splitToStringVec(str, '\'')}; + // Remove all the escaped single quotation marks + std::string retVal; + // Do not really care about unterminated sequences + for (size_t i = 0; i < strList.size(); i++) + { + retVal += strList[i]; + } + return retVal; } void getLayerPrecisions(Arguments& arguments, char const* argument, LayerPrecisions& layerPrecisions) @@ -293,7 +509,41 @@ void getLayerOutputTypes(Arguments& arguments, char const* argument, LayerOutput } } -bool getShapesBuild(Arguments& arguments, std::unordered_map& shapes, char const* argument, +void getLayerDeviceTypes(Arguments& arguments, char const* argument, LayerDeviceTypes& layerDeviceTypes) +{ + std::string list; + if (!getAndDelOption(arguments, argument, list)) + { + return; + } + + // The layerDeviceTypes flag contains comma-separated layerName:deviceType pairs. + std::vector deviceList{splitToStringVec(list, ',')}; + for (auto const& s : deviceList) + { + auto nameDevicePair = splitNameAndValue(s); + auto const layerName = removeSingleQuotationMarks(nameDevicePair.first); + layerDeviceTypes[layerName] = stringToValue(nameDevicePair.second); + } +} + +void getStringsSet(Arguments& arguments, char const* argument, StringSet& stringSet) +{ + std::string list; + if (!getAndDelOption(arguments, argument, list)) + { + return; + } + + // The layerPrecisions flag contains comma-separated layerName:precision pairs. 
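+    // Each comma-separated entry is inserted verbatim; used for flags that take a plain list
+    // of names (e.g. tensor names) rather than name:value pairs.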
+ std::vector strings{splitToStringVec(list, ',')}; + for (auto const& s : strings) + { + stringSet.insert(s); + } +} + +bool getShapesBuild(Arguments& arguments, BuildOptions::ShapeProfile& shapes, char const* argument, nvinfer1::OptProfileSelector selector) { std::string list; @@ -309,7 +559,7 @@ bool getShapesBuild(Arguments& arguments, std::unordered_map>& shapes, const char* argument) +bool getShapesInference(Arguments& arguments, InferenceOptions::ShapeProfile& shapes, const char* argument) { std::string list; bool retVal = getAndDelOption(arguments, argument, list); @@ -324,67 +574,195 @@ bool getShapesInference(Arguments& arguments, std::unordered_map& shapes, bool minShapes, bool optShapes, bool maxShapes, bool calib) +void fillShapes(BuildOptions::ShapeProfile& shapes, std::string const& name, ShapeRange const& sourceShapeRange, + nvinfer1::OptProfileSelector minDimsSource, nvinfer1::OptProfileSelector optDimsSource, + nvinfer1::OptProfileSelector maxDimsSource) { - // Only accept optShapes only or all three of minShapes, optShapes, maxShapes - if ( ((minShapes || maxShapes) && !optShapes) // minShapes only, maxShapes only, both minShapes and maxShapes + insertShapesBuild( + shapes, nvinfer1::OptProfileSelector::kMIN, name, sourceShapeRange[static_cast(minDimsSource)]); + insertShapesBuild( + shapes, nvinfer1::OptProfileSelector::kOPT, name, sourceShapeRange[static_cast(optDimsSource)]); + insertShapesBuild( + shapes, nvinfer1::OptProfileSelector::kMAX, name, sourceShapeRange[static_cast(maxDimsSource)]); +} + +void processShapes(BuildOptions::ShapeProfile& shapes, bool minShapes, bool optShapes, bool maxShapes, bool calib) +{ + // Only accept optShapes only or all three of minShapes, optShapes, maxShapes when calib is set + if (((minShapes || maxShapes) && !optShapes) // minShapes only, maxShapes only, both minShapes and maxShapes || (minShapes && !maxShapes && optShapes) // both minShapes and optShapes || (!minShapes && maxShapes && optShapes)) // both maxShapes and optShapes { if (calib) { - throw std::invalid_argument("Must specify only --optShapesCalib or all of --minShapesCalib, --optShapesCalib, --maxShapesCalib"); - } - else - { - throw std::invalid_argument("Must specify only --optShapes or all of --minShapes, --optShapes, --maxShapes"); + throw std::invalid_argument( + "Must specify only --optShapesCalib or all of --minShapesCalib, --optShapesCalib, --maxShapesCalib"); } } - // If optShapes only, expand optShapes to minShapes and maxShapes - if (optShapes && !minShapes && !maxShapes) + if (!minShapes && !optShapes && !maxShapes) { - std::unordered_map newShapes; - for (auto& s : shapes) + return; + } + + BuildOptions::ShapeProfile newShapes; + for (auto& s : shapes) + { + nvinfer1::OptProfileSelector minDimsSource, optDimsSource, maxDimsSource; + minDimsSource = nvinfer1::OptProfileSelector::kMIN; + optDimsSource = nvinfer1::OptProfileSelector::kOPT; + maxDimsSource = nvinfer1::OptProfileSelector::kMAX; + + // Populate missing minShapes + if (!minShapes) + { + if (optShapes) + { + minDimsSource = optDimsSource; + sample::gLogWarning << "optShapes is being broadcasted to minShapes for tensor " << s.first + << std::endl; + } + else + { + minDimsSource = maxDimsSource; + sample::gLogWarning << "maxShapes is being broadcasted to minShapes for tensor " << s.first + << std::endl; + } + } + + // Populate missing optShapes + if (!optShapes) { - insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kMIN, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); - 
insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kOPT, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); - insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kMAX, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); + if (maxShapes) + { + optDimsSource = maxDimsSource; + sample::gLogWarning << "maxShapes is being broadcasted to optShapes for tensor " << s.first + << std::endl; + } + else + { + optDimsSource = minDimsSource; + sample::gLogWarning << "minShapes is being broadcasted to optShapes for tensor " << s.first + << std::endl; + } + } + + // Populate missing maxShapes + if (!maxShapes) + { + if (optShapes) + { + maxDimsSource = optDimsSource; + sample::gLogWarning << "optShapes is being broadcasted to maxShapes for tensor " << s.first + << std::endl; + } + else + { + maxDimsSource = minDimsSource; + sample::gLogWarning << "minShapes is being broadcasted to maxShapes for tensor " << s.first + << std::endl; + } } - shapes = newShapes; + + fillShapes(newShapes, s.first, s.second, minDimsSource, optDimsSource, maxDimsSource); } + shapes = newShapes; } -template -void printShapes(std::ostream& os, const char* phase, const T& shapes) +bool getOptimizationProfiles( + Arguments& arguments, std::vector& optProfiles, char const* argument) { - if (shapes.empty()) + bool retValue{false}; + int32_t pos{}; + size_t profileIndex{}; + + auto getShapes + = [](BuildOptions::ShapeProfile& shapes, std::string const& list, nvinfer1::OptProfileSelector selector) { + std::vector shapeList{splitToStringVec(list, ',')}; + for (auto const& s : shapeList) + { + auto nameDimsPair = splitNameAndValue>(s); + auto tensorName = removeSingleQuotationMarks(nameDimsPair.first); + auto dims = nameDimsPair.second; + insertShapesBuild(shapes, selector, tensorName, dims); + } + }; + + while (getAndDelOptionWithPosition(arguments, argument, profileIndex, pos)) { - os << "Input " << phase << " shapes: model" << std::endl; + BuildOptions::ShapeProfile optProfile{}; + bool minShapes{false}, maxShapes{false}, optShapes{false}; + for (int32_t i = 0; i < nvinfer1::EnumMax(); i++, pos++) + { + std::string value; + + if (!minShapes && getAndDelOptionBehind(arguments, "--minShapes", pos, value)) + { + minShapes = true; + getShapes(optProfile, value, nvinfer1::OptProfileSelector::kMIN); + } + else if (!maxShapes && getAndDelOptionBehind(arguments, "--maxShapes", pos, value)) + { + maxShapes = true; + getShapes(optProfile, value, nvinfer1::OptProfileSelector::kMAX); + } + else if (!optShapes && getAndDelOptionBehind(arguments, "--optShapes", pos, value)) + { + optShapes = true; + getShapes(optProfile, value, nvinfer1::OptProfileSelector::kOPT); + } + else + { + break; + } + } + processShapes(optProfile, minShapes, optShapes, maxShapes, false); + if (profileIndex >= optProfiles.size()) + { + optProfiles.resize(profileIndex + 1); + } + if (!optProfiles[profileIndex].empty()) + { + throw std::invalid_argument("Optimization profile index cannot be the same."); + } + optProfiles[profileIndex] = optProfile; + retValue = true; } - else + + profileIndex = 0; + for (auto const& optProfile : optProfiles) { - for (const auto& s : shapes) + if (optProfile.empty()) { - os << "Input " << phase << " shape: " << s.first << "=" << s.second << std::endl; + throw std::invalid_argument(std::string("Found invalid or missing shape spec at profile index ") + + std::to_string(profileIndex) + std::string(". 
")); } + ++profileIndex; } + return retValue; } -std::ostream& printBatch(std::ostream& os, int32_t maxBatch) +template +void printShapes(std::ostream& os, char const* phase, T const& shapes, int32_t profileIndex) { - if (maxBatch != maxBatchNotProvided) + if (shapes.empty()) { - os << maxBatch; + os << "Input " << phase << " shapes: model" << std::endl; } else { - os << "explicit batch"; + std::string profileString = (profileIndex != -1 && strcmp(phase, "build") == 0) + ? "(profile " + std::to_string(profileIndex) + ")" + : ""; + for (auto const& s : shapes) + { + os << "Input " << phase << " shape " << profileString << ": " << s.first << "=" << s.second << std::endl; + } } - return os; } -std::ostream& printTacticSources(std::ostream& os, nvinfer1::TacticSources enabledSources, nvinfer1::TacticSources disabledSources) +std::ostream& printTacticSources( + std::ostream& os, nvinfer1::TacticSources enabledSources, nvinfer1::TacticSources disabledSources) { if (!enabledSources && !disabledSources) { @@ -405,24 +783,41 @@ std::ostream& printTacticSources(std::ostream& os, nvinfer1::TacticSources enabl addSource(1U << static_cast(nvinfer1::TacticSource::kCUBLAS), "cublas"); addSource(1U << static_cast(nvinfer1::TacticSource::kCUBLAS_LT), "cublasLt"); -#if (NV_TENSORRT_MAJOR > 7) addSource(1U << static_cast(nvinfer1::TacticSource::kCUDNN), "cudnn"); -#endif + addSource(1U << static_cast(nvinfer1::TacticSource::kEDGE_MASK_CONVOLUTIONS), "edge mask convolutions"); + addSource(1U << static_cast(nvinfer1::TacticSource::kJIT_CONVOLUTIONS), "JIT convolutions"); } return os; } std::ostream& printPrecision(std::ostream& os, BuildOptions const& options) { + if (options.stronglyTyped) + { + os << "Strongly Typed"; + return os; + } os << "FP32"; if (options.fp16) { os << "+FP16"; } + if (options.bf16) + { + os << "+BF16"; + } if (options.int8) { os << "+INT8"; } + if (options.fp8) + { + os << "+FP8"; + } + if (options.int4) + { + os << "+INT4"; + } if (options.precisionConstraints == PrecisionConstraints::kOBEY) { os << " (obey precision constraints)"; @@ -434,13 +829,27 @@ std::ostream& printPrecision(std::ostream& os, BuildOptions const& options) return os; } -std::ostream& printTimingCache(std::ostream& os, BuildOptions const& options) +std::ostream& printTempfileControls(std::ostream& os, TempfileControlFlags const tempfileControls) +{ + auto getFlag = [&](TempfileControlFlag f) -> char const* { + bool allowed = !!(tempfileControls & (1U << static_cast(f))); + return allowed ? 
"allow" : "deny"; + }; + auto const inMemory = getFlag(TempfileControlFlag::kALLOW_IN_MEMORY_FILES); + auto const temporary = getFlag(TempfileControlFlag::kALLOW_TEMPORARY_FILES); + + os << "{ in_memory: " << inMemory << ", temporary: " << temporary << " }"; + + return os; +} + +std::ostream& printTimingCache(std::ostream& os, TimingCacheMode const& timingCacheMode) { - switch (options.timingCacheMode) + switch (timingCacheMode) { - case TimingCacheMode::kGLOBAL: os << "global"; break; - case TimingCacheMode::kLOCAL: os << "local"; break; - case TimingCacheMode::kDISABLE: os << "disable"; break; + case TimingCacheMode::kGLOBAL: os << "global"; break; + case TimingCacheMode::kLOCAL: os << "local"; break; + case TimingCacheMode::kDISABLE: os << "disable"; break; } return os; } @@ -459,20 +868,67 @@ std::ostream& printSparsity(std::ostream& os, BuildOptions const& options) std::ostream& printMemoryPools(std::ostream& os, BuildOptions const& options) { - auto const printValueOrDefault = [&os](double const val) { + auto const printValueOrDefault = [&os](double const val, char const* unit = "MiB") { if (val >= 0) { - os << val << " MiB"; + os << val << " " << unit; } else { os << "default"; } }; - os << "workspace: "; printValueOrDefault(options.workspace); os << ", "; - os << "dlaSRAM: "; printValueOrDefault(options.dlaSRAM); os << ", "; - os << "dlaLocalDRAM: "; printValueOrDefault(options.dlaLocalDRAM); os << ", "; - os << "dlaGlobalDRAM: "; printValueOrDefault(options.dlaGlobalDRAM); + os << "workspace: "; + printValueOrDefault(options.workspace); + os << ", "; + os << "dlaSRAM: "; + printValueOrDefault(options.dlaSRAM); + os << ", "; + os << "dlaLocalDRAM: "; + printValueOrDefault(options.dlaLocalDRAM); + os << ", "; + os << "dlaGlobalDRAM: "; + printValueOrDefault(options.dlaGlobalDRAM); + os << ", "; + os << "tacticSharedMem: "; + printValueOrDefault(options.tacticSharedMem, "KiB"); + return os; +} + +std::string previewFeatureToString(PreviewFeature feature) +{ + // clang-format off + switch (feature) + { + case PreviewFeature::kPROFILE_SHARING_0806: + { + gLogWarning << "profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect." << std::endl; + break; + } + case PreviewFeature::kALIASED_PLUGIN_IO_10_03: return "kALIASED_PLUGIN_IO_10_03"; + } + return "Invalid Preview Feature"; + // clang-format on +} + +std::ostream& printPreviewFlags(std::ostream& os, BuildOptions const& options) +{ + if (options.previewFeatures.empty()) + { + os << "Use default preview flags."; + return os; + } + + auto const addFlag = [&](PreviewFeature feat) { + int32_t featVal = static_cast(feat); + if (options.previewFeatures.find(featVal) != options.previewFeatures.end()) + { + os << previewFeatureToString(feat) << (options.previewFeatures.at(featVal) ? 
" [ON], " : " [OFF], "); + } + }; + + addFlag(PreviewFeature::kALIASED_PLUGIN_IO_10_03); + return os; } @@ -487,51 +943,41 @@ Arguments argsToArgumentsMap(int32_t argc, char* argv[]) if (valuePtr) { std::string value{valuePtr + 1}; - arguments.emplace(std::string(argv[i], valuePtr - argv[i]), value); + arguments.emplace(std::string(argv[i], valuePtr - argv[i]), std::make_pair(value, i)); } else { - arguments.emplace(argv[i], ""); + arguments.emplace(argv[i], std::make_pair(std::string(""), i)); } } return arguments; } -void BaseModelOptions::parse(Arguments& arguments) +namespace { - if (getAndDelOption(arguments, "--onnx", model)) - { - format = ModelFormat::kONNX; - } - else if (getAndDelOption(arguments, "--uff", model)) - { - format = ModelFormat::kUFF; - } - else if (getAndDelOption(arguments, "--model", model)) +std::string resolveHomeDirectoryOnLinux(std::string const& model) +{ + std::string filePath{model}; +#ifndef _WIN32 + if (filePath[0] == '~') { - format = ModelFormat::kCAFFE; + char const* home = std::getenv("HOME"); + if (home) + { + filePath.replace(0, 1, home); + } } +#endif + return filePath; } +} // namespace -void UffInput::parse(Arguments& arguments) +void BaseModelOptions::parse(Arguments& arguments) { - getAndDelOption(arguments, "--uffNHWC", NHWC); - std::vector args; - if (getAndDelRepeatedOption(arguments, "--uffInput", args)) + if (getAndDelOption(arguments, "--onnx", model)) { - for (const auto& i : args) - { - std::vector values{splitToStringVec(i, ',')}; - if (values.size() == 4) - { - nvinfer1::Dims3 dims{std::stoi(values[1]), std::stoi(values[2]), std::stoi(values[3])}; - inputs.emplace_back(values[0], dims); - } - else - { - throw std::invalid_argument(std::string("Invalid uffInput ") + i); - } - } + format = ModelFormat::kONNX; + model = resolveHomeDirectoryOnLinux(model); } } @@ -541,56 +987,66 @@ void ModelOptions::parse(Arguments& arguments) switch (baseModel.format) { - case ModelFormat::kCAFFE: + case ModelFormat::kONNX: + case ModelFormat::kANY: { - getAndDelOption(arguments, "--deploy", prototxt); break; } - case ModelFormat::kUFF: - { - uffInputs.parse(arguments); - if (uffInputs.inputs.empty()) - { - throw std::invalid_argument("Uff models require at least one input"); - } - break; } - case ModelFormat::kONNX: - break; - case ModelFormat::kANY: + + if (baseModel.format == ModelFormat::kONNX) { - if (getAndDelOption(arguments, "--deploy", prototxt)) + if (!outputs.empty()) { - baseModel.format = ModelFormat::kCAFFE; + throw std::invalid_argument("The --output flag should not be used with ONNX models."); } - break; } +} + +void getTempfileControls(Arguments& arguments, char const* argument, TempfileControlFlags& tempfileControls) +{ + std::string list; + if (!getAndDelOption(arguments, argument, list)) + { + return; } - // The --output flag should only be used with Caffe and UFF. It has no effect on ONNX. 
- std::vector outArgs; - if (getAndDelRepeatedOption(arguments, "--output", outArgs)) + std::vector controlList{splitToStringVec(list, ',')}; + for (auto const& s : controlList) { - for (const auto& o : outArgs) + auto controlAllowPair = splitNameAndValue(s); + bool allowed{false}; + int32_t offset{-1}; + + if (controlAllowPair.second.compare("allow") == 0) { - for (auto& v : splitToStringVec(o, ',')) - { - outputs.emplace_back(std::move(v)); - } + allowed = true; } - } - if (baseModel.format == ModelFormat::kCAFFE || baseModel.format == ModelFormat::kUFF) - { - if (outputs.empty()) + else if (controlAllowPair.second.compare("deny") != 0) { - throw std::invalid_argument("Caffe and Uff models require at least one output"); + throw std::invalid_argument("--tempfileControls value should be `deny` or `allow`"); } - } - else if (baseModel.format == ModelFormat::kONNX) - { - if (!outputs.empty()) + + if (controlAllowPair.first.compare("in_memory") == 0) { - throw std::invalid_argument("The --output flag should not be used with ONNX models."); + offset = static_cast(TempfileControlFlag::kALLOW_IN_MEMORY_FILES); + } + else if (controlAllowPair.first.compare("temporary") == 0) + { + offset = static_cast(TempfileControlFlag::kALLOW_TEMPORARY_FILES); + } + else + { + throw std::invalid_argument(std::string{"Unknown --tempfileControls key "} + controlAllowPair.first); + } + + if (allowed) + { + tempfileControls |= (1U << offset); + } + else + { + tempfileControls &= ~(1U << offset); } } } @@ -610,38 +1066,59 @@ void BuildOptions::parse(Arguments& arguments) getFormats(inputFormats, "--inputIOFormats"); getFormats(outputFormats, "--outputIOFormats"); - bool addedExplicitBatchFlag{false}; - getAndDelOption(arguments, "--explicitBatch", addedExplicitBatchFlag); - if (addedExplicitBatchFlag) - { - sample::gLogWarning << "--explicitBatch flag has been deprecated and has no effect!" << std::endl; - sample::gLogWarning << "Explicit batch dim is automatically enabled if input model is ONNX or if dynamic " - << "shapes are provided when the engine is built." 
<< std::endl; - } - - bool minShapes = getShapesBuild(arguments, shapes, "--minShapes", nvinfer1::OptProfileSelector::kMIN); - bool optShapes = getShapesBuild(arguments, shapes, "--optShapes", nvinfer1::OptProfileSelector::kOPT); - bool maxShapes = getShapesBuild(arguments, shapes, "--maxShapes", nvinfer1::OptProfileSelector::kMAX); - processShapes(shapes, minShapes, optShapes, maxShapes, false); - bool minShapesCalib - = getShapesBuild(arguments, shapesCalib, "--minShapesCalib", nvinfer1::OptProfileSelector::kMIN); - bool optShapesCalib - = getShapesBuild(arguments, shapesCalib, "--optShapesCalib", nvinfer1::OptProfileSelector::kOPT); - bool maxShapesCalib - = getShapesBuild(arguments, shapesCalib, "--maxShapesCalib", nvinfer1::OptProfileSelector::kMAX); - processShapes(shapesCalib, minShapesCalib, optShapesCalib, maxShapesCalib, true); + bool getCalibProfile = getAndDelOption(arguments, "--calibProfile", calibProfile); + if (!getOptimizationProfiles(arguments, optProfiles, "--profile")) + { + ShapeProfile shapes; + bool minShapes{false}, optShapes{false}, maxShapes{false}; + try + { + minShapes = getShapesBuild(arguments, shapes, "--minShapes", nvinfer1::OptProfileSelector::kMIN); + optShapes = getShapesBuild(arguments, shapes, "--optShapes", nvinfer1::OptProfileSelector::kOPT); + maxShapes = getShapesBuild(arguments, shapes, "--maxShapes", nvinfer1::OptProfileSelector::kMAX); + } + catch (std::invalid_argument const& arg) + { + throw std::invalid_argument(arg.what() + + std::string(" conversion failure: failed to parse minShapes/optShapes/maxShapes. Please double check " + "your input string.")); + } - bool addedExplicitPrecisionFlag{false}; - getAndDelOption(arguments, "--explicitPrecision", addedExplicitPrecisionFlag); - if (addedExplicitPrecisionFlag) + processShapes(shapes, minShapes, optShapes, maxShapes, false); + optProfiles.emplace_back(shapes); + } + + if (calibProfile >= optProfiles.size()) + { + throw std::invalid_argument( + std::string("--calibProfile shouldn't greater than the size of optimization profile.")); + } + + BuildOptions::ShapeProfile dummyShapes; + + bool remainingMinShapes = getShapesBuild(arguments, dummyShapes, "--minShapes", nvinfer1::OptProfileSelector::kMIN); + bool remainingOptShapes = getShapesBuild(arguments, dummyShapes, "--optShapes", nvinfer1::OptProfileSelector::kOPT); + bool remainingMaxShapes = getShapesBuild(arguments, dummyShapes, "--maxShapes", nvinfer1::OptProfileSelector::kMAX); + if (remainingMinShapes || remainingOptShapes || remainingMaxShapes) { - sample::gLogWarning << "--explicitPrecision flag has been deprecated and has no effect!" << std::endl; + throw std::invalid_argument("Multiple --minShapes/--optShapes/--maxShapes without --profile are not allowed. "); } - if (getAndDelOption(arguments, "--workspace", workspace)) + bool minShapesCalib{false}, optShapesCalib{false}, maxShapesCalib{false}; + try { - sample::gLogWarning << "--workspace flag has been deprecated by --memPoolSize flag." << std::endl; + minShapesCalib = getShapesBuild(arguments, shapesCalib, "--minShapesCalib", nvinfer1::OptProfileSelector::kMIN); + optShapesCalib = getShapesBuild(arguments, shapesCalib, "--optShapesCalib", nvinfer1::OptProfileSelector::kOPT); + maxShapesCalib = getShapesBuild(arguments, shapesCalib, "--maxShapesCalib", nvinfer1::OptProfileSelector::kMAX); } + catch (std::invalid_argument const& arg) + { + throw std::invalid_argument(arg.what() + + std::string(" conversion failure: failed to parse minShapesCalib/optShapesCalib/maxShapesCalib. 
Please " + "double check your input string.")); + } + + processShapes(shapesCalib, minShapesCalib, optShapesCalib, maxShapesCalib, true); std::string memPoolSizes; getAndDelOption(arguments, "--memPoolSize", memPoolSizes); @@ -650,26 +1127,47 @@ void BuildOptions::parse(Arguments& arguments) { std::string memPoolName; double memPoolSize; - std::tie(memPoolName, memPoolSize) = splitNameAndValue(memPoolSpec); + try + { + std::string strPoolSize; + std::tie(memPoolName, strPoolSize) = splitNameAndValue(memPoolSpec); + memPoolSize = stringToValue(addDefaultUnitSuffixIfNotSpecified(strPoolSize, 'M')); + } + catch (std::invalid_argument const& arg) + { + throw std::invalid_argument(arg.what() + + std::string( + " conversion failure: failed to parse --memPoolSize. Please double check your input string.")); + } + if (memPoolSize < 0) { throw std::invalid_argument(std::string("Negative memory pool size: ") + std::to_string(memPoolSize)); } if (memPoolName == "workspace") { - workspace = memPoolSize; + // use unit in MB. + workspace = memPoolSize / 1.0_MiB; } else if (memPoolName == "dlaSRAM") { - dlaSRAM = memPoolSize; + // use unit in MB. + dlaSRAM = memPoolSize / 1.0_MiB; } else if (memPoolName == "dlaLocalDRAM") { - dlaLocalDRAM = memPoolSize; + // use unit in MB. + dlaLocalDRAM = memPoolSize / 1.0_MiB; } else if (memPoolName == "dlaGlobalDRAM") { - dlaGlobalDRAM = memPoolSize; + // use unit in MB. + dlaGlobalDRAM = memPoolSize / 1.0_MiB; + } + else if (memPoolName == "tacticSharedMem") + { + // use unit in KB. + tacticSharedMem = memPoolSize / 1.0_KiB; } else if (!memPoolName.empty()) { @@ -677,8 +1175,6 @@ void BuildOptions::parse(Arguments& arguments) } } - getAndDelOption(arguments, "--maxBatch", maxBatch); - getAndDelOption(arguments, "--minTiming", minTiming); getAndDelOption(arguments, "--avgTiming", avgTiming); bool best{false}; @@ -687,16 +1183,79 @@ void BuildOptions::parse(Arguments& arguments) { int8 = true; fp16 = true; + + // BF16 only supported on Ampere+ + if (samplesCommon::getSMVersion() >= 0x0800) + { + bf16 = true; + } } getAndDelOption(arguments, "--refit", refittable); + + getAndDelOption(arguments, "--weightless", stripWeights); + getAndDelOption(arguments, "--stripWeights", stripWeights); + + bool stripAllWeights{}; + getAndDelOption(arguments, "--stripAllWeights", stripAllWeights); + if (stripAllWeights) + { + refittable = true; + stripWeights = true; + } + + // --vc and --versionCompatible are synonyms + getAndDelOption(arguments, "--vc", versionCompatible); + if (!versionCompatible) + { + getAndDelOption(arguments, "--versionCompatible", versionCompatible); + } + +#if !TRT_WINML + // --pi and --pluginInstanceNorm are synonyms + getAndDelOption(arguments, "--pi", pluginInstanceNorm); + if (!pluginInstanceNorm) + { + getAndDelOption(arguments, "--pluginInstanceNorm", pluginInstanceNorm); + } +#endif + + getAndDelOption(arguments, "--excludeLeanRuntime", excludeLeanRuntime); + getAndDelOption(arguments, "--noCompilationCache", disableCompilationCache); getAndDelNegOption(arguments, "--noTF32", tf32); getAndDelOption(arguments, "--fp16", fp16); + getAndDelOption(arguments, "--bf16", bf16); getAndDelOption(arguments, "--int8", int8); + getAndDelOption(arguments, "--fp8", fp8); + getAndDelOption(arguments, "--int4", int4); + getAndDelOption(arguments, "--stronglyTyped", stronglyTyped); + if (stronglyTyped) + { + auto disableAndLog = [](bool& flag, std::string mode, std::string type) { + if (flag) + { + flag = false; + sample::gLogWarning << "Invalid usage, setting " << mode 
+ << " mode is not allowed if graph is strongly typed. Disabling BuilderFlag::" + << type << "." << std::endl; + } + }; + disableAndLog(fp16, "fp16", "kFP16"); + disableAndLog(int8, "int8", "kINT8"); + disableAndLog(bf16, "bf16", "kBF16"); + disableAndLog(fp8, "fp8", "kFP8"); + disableAndLog(int4, "int4", "kINT4"); + } + + if (fp8 && int8) + { + throw std::invalid_argument("Invalid usage, fp8 and int8 aren't allowed to be enabled together."); + } getAndDelOption(arguments, "--safe", safe); - getAndDelOption(arguments, "--consistency", consistency); + getAndDelOption(arguments, "--buildDLAStandalone", buildDLAStandalone); + getAndDelOption(arguments, "--allowGPUFallback", allowGPUFallback); getAndDelOption(arguments, "--restricted", restricted); - + getAndDelOption(arguments, "--skipInference", skipInference); getAndDelOption(arguments, "--directIO", directIO); std::string precisionConstraintsString; @@ -720,10 +1279,11 @@ void BuildOptions::parse(Arguments& arguments) getLayerPrecisions(arguments, "--layerPrecisions", layerPrecisions); getLayerOutputTypes(arguments, "--layerOutputTypes", layerOutputTypes); + getLayerDeviceTypes(arguments, "--layerDeviceTypes", layerDeviceTypes); if (layerPrecisions.empty() && layerOutputTypes.empty() && precisionConstraints != PrecisionConstraints::kNONE) { - sample::gLogWarning << "When --precisionConstraints flag is set to \"obey\" or \"prefer\", please add " + sample::gLogWarning << R"(When --precisionConstraints flag is set to "obey" or "prefer", please add )" << "--layerPrecision/--layerOutputTypes flags to set layer-wise precisions and output " << "types." << std::endl; } @@ -731,79 +1291,52 @@ void BuildOptions::parse(Arguments& arguments) && precisionConstraints == PrecisionConstraints::kNONE) { sample::gLogWarning << "--layerPrecision/--layerOutputTypes flags have no effect when --precisionConstraints " - << "flag is set to \"none\"." << std::endl; + << R"(flag is set to "none".)" << std::endl; } - std::string sparsityString; - getAndDelOption(arguments, "--sparsity", sparsityString); - if (sparsityString == "disable") - { - sparsity = SparsityFlag::kDISABLE; - } - else if (sparsityString == "enable") - { - sparsity = SparsityFlag::kENABLE; - } - else if (sparsityString == "force") - { - sparsity = SparsityFlag::kFORCE; - } - else if (!sparsityString.empty()) - { - throw std::invalid_argument(std::string("Unknown sparsity mode: ") + sparsityString); - } + getStringsSet(arguments, "--markDebug", debugTensors); + + getAndDelOption(arguments, "--sparsity", sparsity); bool calibCheck = getAndDelOption(arguments, "--calib", calibration); - if (int8 && calibCheck && !shapes.empty() && shapesCalib.empty()) + if (int8 && calibCheck && !optProfiles[calibProfile].empty() && shapesCalib.empty()) { - shapesCalib = shapes; + shapesCalib = optProfiles[calibProfile]; } - - std::string profilingVerbosityString; - if (getAndDelOption(arguments, "--nvtxMode", profilingVerbosityString)) + else if (!shapesCalib.empty() && getCalibProfile) { - sample::gLogWarning << "--nvtxMode flag has been deprecated by --profilingVerbosity flag." << std::endl; + sample::gLogWarning + << "--calibProfile have no effect when --minShapesCalib/--optShapesCalib/--maxShapesCalib is set." 
+ << std::endl; } + std::string profilingVerbosityString; + getAndDelOption(arguments, "--profilingVerbosity", profilingVerbosityString); if (profilingVerbosityString == "layer_names_only") { -#if (NV_TENSORRT_MAJOR > 7) profilingVerbosity = nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY; -#else - profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; -#endif } else if (profilingVerbosityString == "none") { profilingVerbosity = nvinfer1::ProfilingVerbosity::kNONE; } -#if (NV_TENSORRT_MAJOR > 7) else if (profilingVerbosityString == "detailed") { profilingVerbosity = nvinfer1::ProfilingVerbosity::kDETAILED; } -#endif else if (profilingVerbosityString == "default") { -#if (NV_TENSORRT_MAJOR > 7) sample::gLogWarning << "--profilingVerbosity=default has been deprecated by " "--profilingVerbosity=layer_names_only." << std::endl; profilingVerbosity = nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY; -#else - profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; -#endif } else if (profilingVerbosityString == "verbose") { -#if (NV_TENSORRT_MAJOR > 7) sample::gLogWarning << "--profilingVerbosity=verbose has been deprecated by --profilingVerbosity=detailed." << std::endl; profilingVerbosity = nvinfer1::ProfilingVerbosity::kDETAILED; -#else - profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; -#endif } else if (!profilingVerbosityString.empty()) { @@ -814,6 +1347,8 @@ void BuildOptions::parse(Arguments& arguments) { load = true; } + getAndDelOption(arguments, "--getPlanVersionOnly", getPlanVersionOnly); + if (getAndDelOption(arguments, "--saveEngine", engine)) { save = true; @@ -858,12 +1393,18 @@ void BuildOptions::parse(Arguments& arguments) { source = nvinfer1::TacticSource::kCUBLAS_LT; } -#if (NV_TENSORRT_MAJOR > 7) else if (t == "CUDNN") { source = nvinfer1::TacticSource::kCUDNN; } -#endif + else if (t == "EDGE_MASK_CONVOLUTIONS") + { + source = nvinfer1::TacticSource::kEDGE_MASK_CONVOLUTIONS; + } + else if (t == "JIT_CONVOLUTIONS") + { + source = nvinfer1::TacticSource::kJIT_CONVOLUTIONS; + } else { throw std::invalid_argument(std::string("Unknown tactic source: ") + t); @@ -887,38 +1428,179 @@ void BuildOptions::parse(Arguments& arguments) } } - bool noBuilderCache{false}; - getAndDelOption(arguments, "--noBuilderCache", noBuilderCache); - getAndDelOption(arguments, "--timingCacheFile", timingCacheFile); - if (noBuilderCache) + bool noBuilderCache{false}; + getAndDelOption(arguments, "--noBuilderCache", noBuilderCache); + getAndDelOption(arguments, "--timingCacheFile", timingCacheFile); + if (noBuilderCache) + { + timingCacheMode = TimingCacheMode::kDISABLE; + } + else if (!timingCacheFile.empty()) + { + timingCacheMode = TimingCacheMode::kGLOBAL; + } + else + { + timingCacheMode = TimingCacheMode::kLOCAL; + } + getAndDelOption(arguments, "--errorOnTimingCacheMiss", errorOnTimingCacheMiss); + getAndDelOption(arguments, "--builderOptimizationLevel", builderOptimizationLevel); + getAndDelOption(arguments, "--maxTactics", maxTactics); + + std::string runtimePlatformArgs; + getAndDelOption(arguments, "--runtimePlatform", runtimePlatformArgs); + if (runtimePlatformArgs == "SameAsBuild" || runtimePlatformArgs.empty()) + { + runtimePlatform = RuntimePlatform::kSAME_AS_BUILD; + } + else if (runtimePlatformArgs == "WindowsAMD64") + { + runtimePlatform = RuntimePlatform::kWINDOWS_AMD64; + } + else + { + throw std::invalid_argument(std::string("Unknown runtime platform: ") + runtimePlatformArgs + + ". 
Valid options: SameAsBuild, WindowsAMD64."); + } + + std::string hardwareCompatibleArgs; + getAndDelOption(arguments, "--hardwareCompatibilityLevel", hardwareCompatibleArgs); + if (hardwareCompatibleArgs == "none" || hardwareCompatibleArgs.empty()) + { + hardwareCompatibilityLevel = HardwareCompatibilityLevel::kNONE; + } + else if (samplesCommon::toLower(hardwareCompatibleArgs) == "ampere+") + { + hardwareCompatibilityLevel = HardwareCompatibilityLevel::kAMPERE_PLUS; + } + else + { + throw std::invalid_argument(std::string("Unknown hardwareCompatibilityLevel: ") + hardwareCompatibleArgs + + ". Valid options: none, ampere+."); + } + + if (pluginInstanceNorm && (versionCompatible || hardwareCompatibilityLevel == HardwareCompatibilityLevel::kAMPERE_PLUS)) + { + throw std::invalid_argument("Plugin InstanceNorm cannot be used with version compatible or hardware compatible engines!"); + } + + getAndDelOption(arguments, "--maxAuxStreams", maxAuxStreams); + + std::string previewFeaturesBuf; + getAndDelOption(arguments, "--preview", previewFeaturesBuf); + std::vector previewFeaturesVec{splitToStringVec(previewFeaturesBuf, ',')}; + for (auto featureName : previewFeaturesVec) + { + bool enable{false}; + if (featureName.front() == '+') + { + enable = true; + } + else if (featureName.front() != '-') + { + throw std::invalid_argument( + "Preview features must be prefixed with + or -, indicating whether it should be enabled or disabled " + "respectively."); + } + featureName.erase(0, 1); + + PreviewFeature feat{}; + if (featureName == "profileSharing0806") + { + sample::gLogWarning + << "profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect." + << std::endl; + } + else if (featureName == "aliasedPluginIO1003") + { + feat = PreviewFeature::kALIASED_PLUGIN_IO_10_03; + } + else + { + throw std::invalid_argument(std::string("Unknown preview feature: ") + featureName); + } + previewFeatures[static_cast(feat)] = enable; + } + + getAndDelOption(arguments, "--tempdir", tempdir); + getTempfileControls(arguments, "--tempfileControls", tempfileControls); + + std::string runtimeMode; + getAndDelOption(arguments, "--useRuntime", runtimeMode); + if (runtimeMode == "full") { - timingCacheMode = TimingCacheMode::kDISABLE; + useRuntime = RuntimeMode::kFULL; } - else if (!timingCacheFile.empty()) + else if (runtimeMode == "dispatch") { - timingCacheMode = TimingCacheMode::kGLOBAL; + useRuntime = RuntimeMode::kDISPATCH; } - else + else if (runtimeMode == "lean") { - timingCacheMode = TimingCacheMode::kLOCAL; + useRuntime = RuntimeMode::kLEAN; + } + else if (!runtimeMode.empty()) + { + throw std::invalid_argument(std::string("Unknown useRuntime: ") + runtimeMode); } + + if ((useRuntime == RuntimeMode::kDISPATCH || useRuntime == RuntimeMode::kLEAN) && !versionCompatible) + { + versionCompatible = true; + sample::gLogWarning << "Implicitly enabling --versionCompatible since --useRuntime=" << runtimeMode + << " is set." 
<< std::endl; + } + + if (useRuntime != RuntimeMode::kFULL && !load) + { + throw std::invalid_argument(std::string("Building a TensorRT engine requires --useRuntime=full.")); + } + + getAndDelOption(arguments, "--leanDLLPath", leanDLLPath); + + // Don't delete the option because the inference option parser requires it + getOption(arguments, "--allowWeightStreaming", allowWeightStreaming); } void SystemOptions::parse(Arguments& arguments) { getAndDelOption(arguments, "--device", device); getAndDelOption(arguments, "--useDLACore", DLACore); - getAndDelOption(arguments, "--allowGPUFallback", fallback); +#if !TRT_WINML std::string pluginName; while (getAndDelOption(arguments, "--plugins", pluginName)) { + sample::gLogWarning << "--plugins flag has been deprecated, use --staticPlugins flag instead." << std::endl; plugins.emplace_back(pluginName); } + while (getAndDelOption(arguments, "--staticPlugins", pluginName)) + { + plugins.emplace_back(pluginName); + } + while (getAndDelOption(arguments, "--setPluginsToSerialize", pluginName)) + { + setPluginsToSerialize.emplace_back(pluginName); + } + while (getAndDelOption(arguments, "--dynamicPlugins", pluginName)) + { + dynamicPlugins.emplace_back(pluginName); + } + getAndDelOption(arguments, "--ignoreParsedPluginLibs", ignoreParsedPluginLibs); +#endif } +constexpr int64_t WeightStreamingBudget::kDISABLE; +constexpr int64_t WeightStreamingBudget::kAUTOMATIC; + void InferenceOptions::parse(Arguments& arguments) { - getAndDelOption(arguments, "--streams", streams); + + if (getAndDelOption(arguments, "--streams", infStreams)) + { + sample::gLogWarning << "--streams flag has been deprecated, use --infStreams flag instead." << std::endl; + } + getAndDelOption(arguments, "--infStreams", infStreams); + getAndDelOption(arguments, "--iterations", iterations); getAndDelOption(arguments, "--duration", duration); getAndDelOption(arguments, "--warmUp", warmup); @@ -935,9 +1617,9 @@ void InferenceOptions::parse(Arguments& arguments) getAndDelOption(arguments, "--threads", threads); getAndDelOption(arguments, "--useCudaGraph", graph); getAndDelOption(arguments, "--separateProfileRun", rerun); - getAndDelOption(arguments, "--buildOnly", skip); getAndDelOption(arguments, "--timeDeserialize", timeDeserialize); getAndDelOption(arguments, "--timeRefit", timeRefit); + getAndDelOption(arguments, "--persistentCacheRatio", persistentCacheRatio); std::string list; getAndDelOption(arguments, "--loadInputs", list); @@ -945,25 +1627,81 @@ void InferenceOptions::parse(Arguments& arguments) splitInsertKeyValue(inputsList, inputs); getShapesInference(arguments, shapes, "--shapes"); - getAndDelOption(arguments, "--batch", batch); + setOptProfile = getAndDelOption(arguments, "--useProfile", optProfileIndex); + + std::string allocationStrategyString; + getAndDelOption(arguments, "--allocationStrategy", allocationStrategyString); + if (allocationStrategyString == "static") + { + memoryAllocationStrategy = MemoryAllocationStrategy::kSTATIC; + } + else if (allocationStrategyString == "profile") + { + memoryAllocationStrategy = MemoryAllocationStrategy::kPROFILE; + } + else if (allocationStrategyString == "runtime") + { + memoryAllocationStrategy = MemoryAllocationStrategy::kRUNTIME; + } + else if (!allocationStrategyString.empty()) + { + throw std::invalid_argument(std::string("Unknown allocationStrategy: ") + allocationStrategyString); + } + + bool allowWs{false}; + getAndDelOption(arguments, "--allowWeightStreaming", allowWs); + bool wsBudgetFound = getAndDelOption(arguments, 
"--weightStreamingBudget", weightStreamingBudget); + if (wsBudgetFound && !allowWs) + { + throw std::invalid_argument( + "The weight streaming budget can only be set with --allowWeightStreaming specified."); + } + if (allowWs && weightStreamingBudget.isDisabled()) + { + sample::gLogWarning << "The engine can stream its weights but it will not at runtime because " + "--weightStreamingBudget unset or set to " + << WeightStreamingBudget::kDISABLE << "." << std::endl; + } + + std::string debugTensorList; + getAndDelOption(arguments, "--saveDebugTensors", debugTensorList); + std::vector fileNames{splitToStringVec(debugTensorList, ',')}; + splitInsertKeyValue(fileNames, debugTensorFileNames); } void ReportingOptions::parse(Arguments& arguments) { - getAndDelOption(arguments, "--percentile", percentile); getAndDelOption(arguments, "--avgRuns", avgs); getAndDelOption(arguments, "--verbose", verbose); getAndDelOption(arguments, "--dumpRefit", refit); getAndDelOption(arguments, "--dumpOutput", output); + getAndDelOption(arguments, "--dumpRawBindingsToFile", dumpRawBindings); getAndDelOption(arguments, "--dumpProfile", profile); getAndDelOption(arguments, "--dumpLayerInfo", layerInfo); + getAndDelOption(arguments, "--dumpOptimizationProfile", optProfileInfo); getAndDelOption(arguments, "--exportTimes", exportTimes); getAndDelOption(arguments, "--exportOutput", exportOutput); getAndDelOption(arguments, "--exportProfile", exportProfile); getAndDelOption(arguments, "--exportLayerInfo", exportLayerInfo); - if (percentile < 0 || percentile > 100) + + std::string percentileString; + getAndDelOption(arguments, "--percentile", percentileString); + std::vector percentileStrings = splitToStringVec(percentileString, ','); + if (!percentileStrings.empty()) + { + percentiles.clear(); + } + for (const auto& p : percentileStrings) { - throw std::invalid_argument(std::string("Percentile ") + std::to_string(percentile) + "is not in [0,100]"); + percentiles.push_back(stringToValue(p)); + } + + for (auto percentile : percentiles) + { + if (percentile < 0.F || percentile > 100.F) + { + throw std::invalid_argument(std::string("Percentile ") + std::to_string(percentile) + "is not in [0,100]"); + } } } @@ -983,61 +1721,40 @@ void AllOptions::parse(Arguments& arguments) system.parse(arguments); inference.parse(arguments); - // Use explicitBatch when input model is ONNX or when dynamic shapes are used. - const bool isOnnx{model.baseModel.format == ModelFormat::kONNX}; - const bool hasDynamicShapes{!build.shapes.empty() || !inference.shapes.empty()}; - const bool detectedExplicitBatch = isOnnx || hasDynamicShapes; - - // Throw an error if user tries to use --batch or --maxBatch when the engine has explicit batch dim. - const bool maxBatchWasSet{build.maxBatch != maxBatchNotProvided}; - const bool batchWasSet{inference.batch != batchNotProvided}; - if (detectedExplicitBatch && (maxBatchWasSet || batchWasSet)) + if (build.useRuntime != RuntimeMode::kFULL && inference.timeRefit) { - throw std::invalid_argument( - "The --batch and --maxBatch flags should not be used when the input model is ONNX or when dynamic shapes " - "are provided. Please use --optShapes and --shapes to set input shapes instead."); + throw std::invalid_argument("--timeRefit requires --useRuntime=full."); } - // If batch and/or maxBatch is not set and the engine has implicit batch dim, set them to default values. 
@@ -983,61 +1721,40 @@ void AllOptions::parse(Arguments& arguments)
 system.parse(arguments);
 inference.parse(arguments);
- // Use explicitBatch when input model is ONNX or when dynamic shapes are used.
- const bool isOnnx{model.baseModel.format == ModelFormat::kONNX};
- const bool hasDynamicShapes{!build.shapes.empty() || !inference.shapes.empty()};
- const bool detectedExplicitBatch = isOnnx || hasDynamicShapes;
-
- // Throw an error if user tries to use --batch or --maxBatch when the engine has explicit batch dim.
- const bool maxBatchWasSet{build.maxBatch != maxBatchNotProvided};
- const bool batchWasSet{inference.batch != batchNotProvided};
- if (detectedExplicitBatch && (maxBatchWasSet || batchWasSet))
+ if (build.useRuntime != RuntimeMode::kFULL && inference.timeRefit)
 {
- throw std::invalid_argument(
- "The --batch and --maxBatch flags should not be used when the input model is ONNX or when dynamic shapes "
- "are provided. Please use --optShapes and --shapes to set input shapes instead.");
+ throw std::invalid_argument("--timeRefit requires --useRuntime=full.");
 }
- // If batch and/or maxBatch is not set and the engine has implicit batch dim, set them to default values.
- if (!detectedExplicitBatch)
+ if (inference.optProfileIndex < static_cast<int32_t>(build.optProfiles.size()))
 {
- // If batch is not set, set it to default value.
- if (!batchWasSet)
- {
- inference.batch = defaultBatch;
- }
- // If maxBatch is not set, set it to be equal to batch.
- if (!maxBatchWasSet)
+ // Propagate shape profile between builder and inference
+ for (auto const& s : build.optProfiles[inference.optProfileIndex])
 {
- build.maxBatch = inference.batch;
+ if (inference.shapes.find(s.first) == inference.shapes.end())
+ {
+ insertShapesInference(
+ inference.shapes, s.first, s.second[static_cast<size_t>(nvinfer1::OptProfileSelector::kOPT)]);
+ }
 }
- // MaxBatch should not be less than batch.
- if (build.maxBatch < inference.batch)
+ for (auto const& s : inference.shapes)
 {
- throw std::invalid_argument("Build max batch " + std::to_string(build.maxBatch)
- + " is less than inference batch " + std::to_string(inference.batch));
+ if (build.optProfiles[inference.optProfileIndex].find(s.first)
+ == build.optProfiles[inference.optProfileIndex].end())
+ {
+ // assume min/opt/max all the same
+ insertShapesBuild(build.optProfiles[inference.optProfileIndex], nvinfer1::OptProfileSelector::kMIN,
+ s.first, s.second);
+ insertShapesBuild(build.optProfiles[inference.optProfileIndex], nvinfer1::OptProfileSelector::kOPT,
+ s.first, s.second);
+ insertShapesBuild(build.optProfiles[inference.optProfileIndex], nvinfer1::OptProfileSelector::kMAX,
+ s.first, s.second);
+ }
 }
 }
- if (build.shapes.empty() && !inference.shapes.empty())
- {
- // If --shapes are provided but --optShapes are not, assume that optShapes is the same as shapes.
- for (auto& s : inference.shapes)
- {
- insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kMIN, s.first, s.second);
- insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kOPT, s.first, s.second);
- insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kMAX, s.first, s.second);
- }
- }
- else if (!build.shapes.empty() && inference.shapes.empty())
- {
- // If --optShapes are provided but --shapes are not, assume that shapes is the same as optShapes.
- for (auto& s : build.shapes)
- {
- insertShapesInference(
- inference.shapes, s.first, s.second[static_cast<size_t>(nvinfer1::OptProfileSelector::kOPT)]);
- }
- }
+ // Set nvtxVerbosity to be the same as build-time profilingVerbosity.
+ inference.nvtxVerbosity = build.profilingVerbosity;
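Aside: a minimal sketch of what the per-profile min/opt/max shapes synchronized above become at build time; the helper is illustrative, only the nvinfer1 calls are real API.

    #include "NvInfer.h"

    // One IOptimizationProfile per --profile index; min/opt/max mirror --minShapes/--optShapes/--maxShapes.
    void addShapeProfile(nvinfer1::IBuilder& builder, nvinfer1::IBuilderConfig& config, char const* inputName,
        nvinfer1::Dims minDims, nvinfer1::Dims optDims, nvinfer1::Dims maxDims)
    {
        nvinfer1::IOptimizationProfile* profile = builder.createOptimizationProfile();
        profile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMIN, minDims);
        profile->setDimensions(inputName, nvinfer1::OptProfileSelector::kOPT, optDims);
        profile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMAX, maxDims);
        config.addOptimizationProfile(profile); // profiles receive contiguous indices, as --profile expects
    }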
 reporting.parse(arguments);
 helps = parseHelp(arguments);
@@ -1050,31 +1767,56 @@ void AllOptions::parse(Arguments& arguments)
 }
 if (build.safe && system.DLACore >= 0)
 {
- auto checkSafeDLAFormats = [](std::vector<IOFormat> const& fmt) {
- return fmt.empty() ? false : std::all_of(fmt.begin(), fmt.end(), [](IOFormat const& pair) {
+ build.buildDLAStandalone = true;
+ }
+ if (build.runtimePlatform != nvinfer1::RuntimePlatform::kSAME_AS_BUILD)
+ {
+ build.skipInference = true;
+ }
+ if (build.buildDLAStandalone)
+ {
+ build.skipInference = true;
+ auto checkSafeDLAFormats = [](std::vector<IOFormat> const& fmt, bool isInput) {
+ return fmt.empty() ? false : std::all_of(fmt.begin(), fmt.end(), [&](IOFormat const& pair) {
 bool supported{false};
- bool const isLINEAR{pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kLINEAR)};
- bool const isCHW4{pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kCHW4)};
+ bool const isDLA_LINEAR{
+ pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kDLA_LINEAR)};
+ bool const isHWC4{pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kCHW4)
+ || pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kDLA_HWC4)};
 bool const isCHW32{pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kCHW32)};
 bool const isCHW16{pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kCHW16)};
- supported |= pair.first == nvinfer1::DataType::kINT8 && (isLINEAR || isCHW4 || isCHW32);
- supported |= pair.first == nvinfer1::DataType::kHALF && (isLINEAR || isCHW4 || isCHW16);
+ supported |= pair.first == nvinfer1::DataType::kINT8
+ && (isDLA_LINEAR || (isInput ? isHWC4 : false) || isCHW32);
+ supported |= pair.first == nvinfer1::DataType::kHALF
+ && (isDLA_LINEAR || (isInput ? isHWC4 : false) || isCHW16);
 return supported;
 });
 };
- if (!checkSafeDLAFormats(build.inputFormats) || !checkSafeDLAFormats(build.outputFormats))
+ if (!checkSafeDLAFormats(build.inputFormats, true) || !checkSafeDLAFormats(build.outputFormats, false))
 {
 throw std::invalid_argument(
- "I/O formats for safe DLA capability are restricted to fp16/int8:linear, fp16:chw16 or int8:chw32");
+ "I/O formats for safe DLA capability are restricted to fp16/int8:dla_linear, fp16/int8:hwc4, "
+ "fp16:chw16 or "
+ "int8:chw32");
 }
- if (system.fallback)
+ if (build.allowGPUFallback)
 {
- throw std::invalid_argument("GPU fallback (--allowGPUFallback) not allowed for safe DLA capability");
+ throw std::invalid_argument("GPU fallback (--allowGPUFallback) not allowed for DLA standalone mode");
 }
 }
 }
}
+void TaskInferenceOptions::parse(Arguments& arguments)
+{
+ getAndDelOption(arguments, "engine", engine);
+ getAndDelOption(arguments, "device", device);
+ getAndDelOption(arguments, "batch", batch);
+ getAndDelOption(arguments, "DLACore", DLACore);
+ getAndDelOption(arguments, "graph", graph);
+ getAndDelOption(arguments, "persistentCacheRatio", persistentCacheRatio);
+}
+
 void SafeBuilderOptions::parse(Arguments& arguments)
 {
 auto getFormats = [&arguments](std::vector<IOFormat>& formatsVector, const char* argument) {
@@ -1097,13 +1839,36 @@ void SafeBuilderOptions::parse(Arguments& arguments)
 getFormats(outputFormats, "--outputIOFormats");
 getAndDelOption(arguments, "--int8", int8);
 getAndDelOption(arguments, "--calib", calibFile);
- getAndDelOption(arguments, "--consistency", consistency);
 getAndDelOption(arguments, "--std", standard);
+#if !TRT_WINML
 std::string pluginName;
 while (getAndDelOption(arguments, "--plugins", pluginName))
 {
+ sample::gLogWarning << "--plugins flag has been deprecated, use --staticPlugins flag instead." 
<< std::endl; plugins.emplace_back(pluginName); } + while (getAndDelOption(arguments, "--staticPlugins", pluginName)) + { + plugins.emplace_back(pluginName); + } +#endif + bool noBuilderCache{false}; + getAndDelOption(arguments, "--noBuilderCache", noBuilderCache); + getAndDelOption(arguments, "--timingCacheFile", timingCacheFile); + getAndDelOption(arguments, "--avgTiming", avgTiming); + if (noBuilderCache) + { + timingCacheMode = TimingCacheMode::kDISABLE; + } + else if (!timingCacheFile.empty()) + { + timingCacheMode = TimingCacheMode::kGLOBAL; + } + else + { + timingCacheMode = TimingCacheMode::kLOCAL; + } + getAndDelOption(arguments, "--sparsity", sparsity); } std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options) @@ -1113,59 +1878,25 @@ std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options) os << "Format: "; switch (options.format) { - case ModelFormat::kCAFFE: - { - os << "Caffe"; - break; - } case ModelFormat::kONNX: { os << "ONNX"; break; } - case ModelFormat::kUFF: - { - os << "UFF"; - break; - } - case ModelFormat::kANY: - os << "*"; - break; + case ModelFormat::kANY: os << "*"; break; } os << std::endl << "Model: " << options.model << std::endl; return os; } -std::ostream& operator<<(std::ostream& os, const UffInput& input) -{ - os << "Uff Inputs Layout: " << (input.NHWC ? "NHWC" : "NCHW") << std::endl; - for (const auto& i : input.inputs) - { - os << "Input: " << i.first << "," << i.second.d[0] << "," << i.second.d[1] << "," << i.second.d[2] << std::endl; - } - - return os; -} - std::ostream& operator<<(std::ostream& os, const ModelOptions& options) { os << options.baseModel; switch (options.baseModel.format) { - case ModelFormat::kCAFFE: - { - os << "Prototxt: " << options.prototxt << std::endl; - break; - } - case ModelFormat::kUFF: - { - os << options.uffInputs; - break; - } case ModelFormat::kONNX: // Fallthrough: No options to report for ONNX or the generic case - case ModelFormat::kANY: - break; + case ModelFormat::kANY: break; } os << "Output:"; @@ -1192,6 +1923,11 @@ std::ostream& operator<<(std::ostream& os, nvinfer1::DataType dtype) os << "fp16"; break; } + case nvinfer1::DataType::kBF16: + { + os << "bf16"; + break; + } case nvinfer1::DataType::kINT8: { os << "int8"; @@ -1207,6 +1943,26 @@ std::ostream& operator<<(std::ostream& os, nvinfer1::DataType dtype) os << "bool"; break; } + case nvinfer1::DataType::kUINT8: + { + os << "uint8"; + break; + } + case nvinfer1::DataType::kFP8: + { + os << "fp8"; + break; + } + case nvinfer1::DataType::kINT64: + { + os << "int64"; + break; + } + case nvinfer1::DataType::kINT4: + { + os << "int4"; + break; + } } return os; } @@ -1240,13 +1996,11 @@ std::ostream& operator<<(std::ostream& os, IOFormat const& format) os << "hwc8"; break; } -#if (NV_TENSORRT_MAJOR > 7) case nvinfer1::TensorFormat::kHWC16: { os << "hwc16"; break; } -#endif case nvinfer1::TensorFormat::kCHW4: { os << "chw4"; @@ -1277,6 +2031,11 @@ std::ostream& operator<<(std::ostream& os, IOFormat const& format) os << "hwc"; break; } + case nvinfer1::TensorFormat::kDHWC: + { + os << "dhwc"; + break; + } case nvinfer1::TensorFormat::kDLA_LINEAR: { os << "dla_linear"; @@ -1293,6 +2052,42 @@ std::ostream& operator<<(std::ostream& os, IOFormat const& format) return os; } +std::ostream& operator<<(std::ostream& os, nvinfer1::DeviceType devType) +{ + switch (devType) + { + case nvinfer1::DeviceType::kGPU: + { + os << "GPU"; + break; + } + case nvinfer1::DeviceType::kDLA: + { + os << "DLA"; + break; + } + } + return os; +} + 
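Aside: a minimal sketch of how an IOFormat pair such as fp16:chw16, printed by the operator above, is applied to a network input; the helper is illustrative, assuming the sample's IOFormat pair of DataType and TensorFormats bitmask.

    #include <utility>
    #include "NvInfer.h"

    using IOFormat = std::pair<nvinfer1::DataType, nvinfer1::TensorFormats>;

    void applyInputFormat(nvinfer1::INetworkDefinition& network, int32_t index, IOFormat const& fmt)
    {
        nvinfer1::ITensor* input = network.getInput(index);
        input->setType(fmt.first);            // e.g. DataType::kHALF for "fp16"
        input->setAllowedFormats(fmt.second); // e.g. 1U << static_cast<int32_t>(TensorFormat::kCHW16) for "chw16"
    }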
+std::ostream& operator<<(std::ostream& os, nvinfer1::RuntimePlatform platform) +{ + switch (platform) + { + case nvinfer1::RuntimePlatform::kSAME_AS_BUILD: + { + os << "Same As Build"; + break; + } + case nvinfer1::RuntimePlatform::kWINDOWS_AMD64: + { + os << "Windows AMD64"; + break; + } + } + return os; +} + std::ostream& operator<<(std::ostream& os, const ShapeRange& dims) { int32_t i = 0; @@ -1319,29 +2114,76 @@ std::ostream& operator<<(std::ostream& os, LayerPrecisions const& layerPrecision return os; } +std::ostream& operator<<(std::ostream& os, LayerDeviceTypes const& layerDeviceTypes) +{ + int32_t i = 0; + for (auto const& layerDevicePair : layerDeviceTypes) + { + os << (i++ ? ", " : "") << layerDevicePair.first << ":" << layerDevicePair.second; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, StringSet const& stringSet) +{ + int64_t i = 0; + for (auto const& s : stringSet) + { + os << (i ? "," : "") << s; + ++i; + } + return os; +} + std::ostream& operator<<(std::ostream& os, const BuildOptions& options) { + // if loadEngine is specified, BuildOptions are N/A + if (options.load) + { + os << std::endl; + return os; + } // clang-format off os << "=== Build Options ===" << std::endl << - - "Max batch: "; printBatch(os, options.maxBatch) << std::endl << "Memory Pools: "; printMemoryPools(os, options) << std::endl << - "minTiming: " << options.minTiming << std::endl << "avgTiming: " << options.avgTiming << std::endl << "Precision: "; printPrecision(os, options) << std::endl << "LayerPrecisions: " << options.layerPrecisions << std::endl << + "Layer Device Types: " << options.layerDeviceTypes << std::endl << "Calibration: " << (options.int8 && options.calibration.empty() ? "Dynamic" : options.calibration.c_str()) << std::endl << "Refit: " << boolToEnabled(options.refittable) << std::endl << + "Strip weights: " << boolToEnabled(options.stripWeights) << std::endl << + "Version Compatible: " << boolToEnabled(options.versionCompatible) << std::endl << +#if !TRT_WINML + "ONNX Plugin InstanceNorm: " << boolToEnabled(options.pluginInstanceNorm) << std::endl << +#endif + "TensorRT runtime: " << options.useRuntime << std::endl << + "Lean DLL Path: " << options.leanDLLPath << std::endl << + "Tempfile Controls: "; printTempfileControls(os, options.tempfileControls) << std::endl << + "Exclude Lean Runtime: " << boolToEnabled(options.excludeLeanRuntime) << std::endl << "Sparsity: "; printSparsity(os, options) << std::endl << "Safe mode: " << boolToEnabled(options.safe) << std::endl << + "Build DLA standalone loadable: " << boolToEnabled(options.buildDLAStandalone) << std::endl << + "Allow GPU fallback for DLA: " << boolToEnabled(options.allowGPUFallback) << std::endl << "DirectIO mode: " << boolToEnabled(options.directIO) << std::endl << "Restricted mode: " << boolToEnabled(options.restricted) << std::endl << + "Skip inference: " << boolToEnabled(options.skipInference) << std::endl << "Save engine: " << (options.save ? options.engine : "") << std::endl << "Load engine: " << (options.load ? 
options.engine : "") << std::endl << "Profiling verbosity: " << static_cast(options.profilingVerbosity) << std::endl << "Tactic sources: "; printTacticSources(os, options.enabledTactics, options.disabledTactics) << std::endl << - "timingCacheMode: "; printTimingCache(os, options) << std::endl << - "timingCacheFile: " << options.timingCacheFile << std::endl; + "timingCacheMode: "; printTimingCache(os, options.timingCacheMode) << std::endl << + "timingCacheFile: " << options.timingCacheFile << std::endl << + "Enable Compilation Cache: "<< boolToEnabled(!options.disableCompilationCache) << std::endl << + "errorOnTimingCacheMiss: " << boolToEnabled(options.errorOnTimingCacheMiss) << std::endl << + "Preview Features: "; printPreviewFlags(os, options) << std::endl << + "MaxAuxStreams: " << options.maxAuxStreams << std::endl << + "BuilderOptimizationLevel: " << options.builderOptimizationLevel << std::endl << + "MaxTactics: " << options.maxTactics << std::endl << + "Calibration Profile Index: " << options.calibProfile << std::endl << + "Weight Streaming: " << boolToEnabled(options.allowWeightStreaming) << std::endl << + "Runtime Platform: " << options.runtimePlatform << std::endl << + "Debug Tensors: " << options.debugTensors << std::endl; // clang-format on auto printIOFormats = [](std::ostream& os, const char* direction, const std::vector formats) { @@ -1351,7 +2193,7 @@ std::ostream& operator<<(std::ostream& os, const BuildOptions& options) } else { - for(const auto& f : formats) + for (const auto& f : formats) { os << direction << ": " << f << std::endl; } @@ -1360,8 +2202,11 @@ std::ostream& operator<<(std::ostream& os, const BuildOptions& options) printIOFormats(os, "Input(s)", options.inputFormats); printIOFormats(os, "Output(s)", options.outputFormats); - printShapes(os, "build", options.shapes); - printShapes(os, "calibration", options.shapesCalib); + for (size_t i = 0; i < options.optProfiles.size(); i++) + { + printShapes(os, "build", options.optProfiles[i], i); + } + printShapes(os, "calibration", options.shapesCalib, -1); return os; } @@ -1372,8 +2217,8 @@ std::ostream& operator<<(std::ostream& os, const SystemOptions& options) os << "=== System Options ===" << std::endl << "Device: " << options.device << std::endl << - "DLACore: " << (options.DLACore != -1 ? std::to_string(options.DLACore) : "") << - (options.DLACore != -1 && options.fallback ? "(With GPU fallback)" : "") << std::endl; + "DLACore: " << (options.DLACore != -1 ? 
std::to_string(options.DLACore) : "") << std::endl; +#if !TRT_WINML os << "Plugins:"; for (const auto& p : options.plugins) @@ -1382,13 +2227,32 @@ std::ostream& operator<<(std::ostream& os, const SystemOptions& options) } os << std::endl; + os << "setPluginsToSerialize:"; + + for (const auto& p : options.setPluginsToSerialize) + { + os << " " << p; + } + os << std::endl; + + os << "dynamicPlugins:"; + + for (const auto& p : options.dynamicPlugins) + { + os << " " << p; + } + os << std::endl; + + os << "ignoreParsedPluginLibs: " << options.ignoreParsedPluginLibs << std::endl; + os << std::endl; +#endif return os; // clang-format on } std::ostream& operator<<(std::ostream& os, const InferenceOptions& options) { -// clang-format off + // clang-format off os << "=== Inference Options ===" << std::endl << "Batch: "; @@ -1400,48 +2264,71 @@ std::ostream& operator<<(std::ostream& os, const InferenceOptions& options) { os << "Explicit" << std::endl; } - printShapes(os, "inference", options.shapes); - os << "Iterations: " << options.iterations << std::endl << - "Duration: " << options.duration << "s (+ " - << options.warmup << "ms warm up)" << std::endl << - "Sleep time: " << options.sleep << "ms" << std::endl << - "Idle time: " << options.idle << "ms" << std::endl << - "Streams: " << options.streams << std::endl << - "ExposeDMA: " << boolToEnabled(!options.overlap) << std::endl << - "Data transfers: " << boolToEnabled(!options.skipTransfers) << std::endl << - "Spin-wait: " << boolToEnabled(options.spin) << std::endl << - "Multithreading: " << boolToEnabled(options.threads) << std::endl << - "CUDA Graph: " << boolToEnabled(options.graph) << std::endl << - "Separate profiling: " << boolToEnabled(options.rerun) << std::endl << - "Time Deserialize: " << boolToEnabled(options.timeDeserialize) << std::endl << - "Time Refit: " << boolToEnabled(options.timeRefit) << std::endl << - "Skip inference: " << boolToEnabled(options.skip) << std::endl; - -// clang-format on + printShapes(os, "inference", options.shapes, options.optProfileIndex); + + std::string wsBudget{"Disabled"}; + if (options.weightStreamingBudget.bytes == WeightStreamingBudget::kAUTOMATIC) + { + wsBudget = "Automatic"; + } + else if (options.weightStreamingBudget.bytes != WeightStreamingBudget::kDISABLE) + { + wsBudget = std::to_string(options.weightStreamingBudget.bytes) + " bytes"; + } + else if (options.weightStreamingBudget.percent != WeightStreamingBudget::kDISABLE) + { + wsBudget = std::to_string(options.weightStreamingBudget.percent) + "%"; + } + + os << "Iterations: " << options.iterations << std::endl << + "Duration: " << options.duration << "s (+ " + << options.warmup << "ms warm up)" << std::endl << + "Sleep time: " << options.sleep << "ms" << std::endl << + "Idle time: " << options.idle << "ms" << std::endl << + "Inference Streams: " << options.infStreams << std::endl << + "ExposeDMA: " << boolToEnabled(!options.overlap) << std::endl << + "Data transfers: " << boolToEnabled(!options.skipTransfers) << std::endl << + "Spin-wait: " << boolToEnabled(options.spin) << std::endl << + "Multithreading: " << boolToEnabled(options.threads) << std::endl << + "CUDA Graph: " << boolToEnabled(options.graph) << std::endl << + "Separate profiling: " << boolToEnabled(options.rerun) << std::endl << + "Time Deserialize: " << boolToEnabled(options.timeDeserialize) << std::endl << + "Time Refit: " << boolToEnabled(options.timeRefit) << std::endl << + "NVTX verbosity: " << static_cast(options.nvtxVerbosity) << std::endl << + "Persistent Cache Ratio: " 
<< static_cast(options.persistentCacheRatio) << std::endl << + "Optimization Profile Index: "<< options.optProfileIndex << std::endl << + "Weight Streaming Budget: " << wsBudget << std::endl; + // clang-format on + os << "Inputs:" << std::endl; for (const auto& input : options.inputs) { os << input.first << "<-" << input.second << std::endl; } + os << "Debug Tensor Save Destinations:" << std::endl; + for (auto const& fileName : options.debugTensorFileNames) + { + os << fileName.first << ": " << fileName.second << std::endl; + } + return os; } std::ostream& operator<<(std::ostream& os, const ReportingOptions& options) { -// clang-format off - os << "=== Reporting Options ===" << std::endl << - - "Verbose: " << boolToEnabled(options.verbose) << std::endl << - "Averages: " << options.avgs << " inferences" << std::endl << - "Percentile: " << options.percentile << std::endl << - "Dump refittable layers:" << boolToEnabled(options.refit) << std::endl << - "Dump output: " << boolToEnabled(options.output) << std::endl << - "Profile: " << boolToEnabled(options.profile) << std::endl << - "Export timing to JSON file: " << options.exportTimes << std::endl << - "Export output to JSON file: " << options.exportOutput << std::endl << - "Export profile to JSON file: " << options.exportProfile << std::endl; -// clang-format on + // clang-format off + os << "=== Reporting Options ===" << std::endl << + "Verbose: " << boolToEnabled(options.verbose) << std::endl << + "Averages: " << options.avgs << " inferences" << std::endl << + "Percentiles: " << joinValuesToString(options.percentiles, ",") << std::endl << + "Dump refittable layers:" << boolToEnabled(options.refit) << std::endl << + "Dump output: " << boolToEnabled(options.output) << std::endl << + "Profile: " << boolToEnabled(options.profile) << std::endl << + "Export timing to JSON file: " << options.exportTimes << std::endl << + "Export output to JSON file: " << options.exportOutput << std::endl << + "Export profile to JSON file: " << options.exportProfile << std::endl; + // clang-format on return os; } @@ -1461,7 +2348,7 @@ std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options) } else { - for(const auto& f : formats) + for (const auto& f : formats) { os << direction << ": " << f << std::endl; } @@ -1476,197 +2363,288 @@ std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options) { os << " + INT8"; } + if (options.fp8) + { + os << " + FP8"; + } + if (options.int4) + { + os << " + INT4"; + } os << std::endl; os << "Calibration file: " << options.calibFile << std::endl; os << "Serialized Network: " << options.serialized << std::endl; printIOFormats(os, "Input(s)", options.inputFormats); printIOFormats(os, "Output(s)", options.outputFormats); - +#if !TRT_WINML os << "Plugins:"; for (const auto& p : options.plugins) { os << " " << p; } +#endif + os << "timingCacheMode: "; + printTimingCache(os, options.timingCacheMode) << std::endl; + os << "timingCacheFile: " << options.timingCacheFile << std::endl; os << std::endl; return os; } void BaseModelOptions::help(std::ostream& os) { -// clang-format off - os << " --uff= UFF model" << std::endl << - " --onnx= ONNX model" << std::endl << - " --model= Caffe model (default = no model, random weights used)" << std::endl; -// clang-format on -} - -void UffInput::help(std::ostream& os) -{ -// clang-format off - os << " --uffInput=,X,Y,Z Input blob name and its dimensions (X,Y,Z=C,H,W), it can be specified " - "multiple times; at least one is required for UFF models" << std::endl << 
- " --uffNHWC Set if inputs are in the NHWC layout instead of NCHW (use " << - "X,Y,Z=H,W,C order in --uffInput)" << std::endl; -// clang-format on + // clang-format off + os << " --onnx= ONNX model" << std::endl; + // clang-format on } void ModelOptions::help(std::ostream& os) { -// clang-format off + // clang-format off os << "=== Model Options ===" << std::endl; BaseModelOptions::help(os); - os << " --deploy= Caffe prototxt file" << std::endl << - " --output=[,]* Output names (it can be specified multiple times); at least one output " - "is required for UFF and Caffe" << std::endl; - UffInput::help(os); -// clang-format on + // clang-format on } void BuildOptions::help(std::ostream& os) { -// clang-format off - os << "=== Build Options ===" "\n" - " --maxBatch Set max batch size and build an implicit batch engine (default = same size as --batch)" "\n" - " This option should not be used when the input model is ONNX or when dynamic shapes are provided." "\n" - " --minShapes=spec Build with dynamic shapes using a profile with the min shapes provided" "\n" - " --optShapes=spec Build with dynamic shapes using a profile with the opt shapes provided" "\n" - " --maxShapes=spec Build with dynamic shapes using a profile with the max shapes provided" "\n" - " --minShapesCalib=spec Calibrate with dynamic shapes using a profile with the min shapes provided" "\n" - " --optShapesCalib=spec Calibrate with dynamic shapes using a profile with the opt shapes provided" "\n" - " --maxShapesCalib=spec Calibrate with dynamic shapes using a profile with the max shapes provided" "\n" - " Note: All three of min, opt and max shapes must be supplied." "\n" - " However, if only opt shapes is supplied then it will be expanded so" "\n" - " that min shapes and max shapes are set to the same values as opt shapes." "\n" - " Input names can be wrapped with escaped single quotes (ex: \\\'Input:0\\\')." "\n" - " Example input shapes spec: input0:1x3x256x256,input1:1x3x128x128" "\n" - " Each input shape is supplied as a key-value pair where key is the input name and" "\n" - " value is the dimensions (including the batch dimension) to be used for that input." "\n" - " Each key-value pair has the key and value separated using a colon (:)." "\n" - " Multiple input shapes can be provided via comma-separated key-value pairs." "\n" - " --inputIOFormats=spec Type and format of each of the input tensors (default = all inputs in fp32:chw)" "\n" - " See --outputIOFormats help for the grammar of type and format list." "\n" - " Note: If this option is specified, please set comma-separated types and formats for all" "\n" - " inputs following the same order as network inputs ID (even if only one input" "\n" - " needs specifying IO format) or set the type and format once for broadcasting." "\n" - " --outputIOFormats=spec Type and format of each of the output tensors (default = all outputs in fp32:chw)" "\n" - " Note: If this option is specified, please set comma-separated types and formats for all" "\n" - " outputs following the same order as network outputs ID (even if only one output" "\n" - " needs specifying IO format) or set the type and format once for broadcasting." "\n" - " IO Formats: spec ::= IOfmt[\",\"spec]" "\n" - " IOfmt ::= type:fmt" "\n" - " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" "\n" - " fmt ::= (\"chw\"|\"chw2\"|\"chw4\"|\"hwc8\"|\"chw16\"|\"chw32\"|\"dhwc8\")[\"+\"fmt]" "\n" - " --workspace=N Set workspace size in MiB." 
"\n" - " --memPoolSize=poolspec Specify the size constraints of the designated memory pool(s) in MiB." "\n" - " Note: Also accepts decimal sizes, e.g. 0.25MiB. Will be rounded down to the nearest integer bytes." "\n" - " Pool constraint: poolspec ::= poolfmt[\",\"poolspec]" "\n" - " poolfmt ::= pool:sizeInMiB" "\n" - " pool ::= \"workspace\"|\"dlaSRAM\"|\"dlaLocalDRAM\"|\"dlaGlobalDRAM\"" "\n" - " --profilingVerbosity=mode Specify profiling verbosity. mode ::= layer_names_only|detailed|none (default = layer_names_only)" "\n" - " --minTiming=M Set the minimum number of iterations used in kernel selection (default = " - << defaultMinTiming << ")" "\n" - " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " - << defaultAvgTiming << ")" "\n" - " --refit Mark the engine as refittable. This will allow the inspection of refittable layers " "\n" - " and weights within the engine." "\n" - " --sparsity=spec Control sparsity (default = disabled). " "\n" - " Sparsity: spec ::= \"disable\", \"enable\", \"force\"" "\n" - " Note: Description about each of these options is as below" "\n" - " disable = do not enable sparse tactics in the builder (this is the default)" "\n" - " enable = enable sparse tactics in the builder (but these tactics will only be" "\n" - " considered if the weights have the right sparsity pattern)" "\n" - " force = enable sparse tactics in the builder and force-overwrite the weights to have" "\n" - " a sparsity pattern (even if you loaded a model yourself)" "\n" - " --noTF32 Disable tf32 precision (default is to enable tf32, in addition to fp32)" "\n" - " --fp16 Enable fp16 precision, in addition to fp32 (default = disabled)" "\n" - " --int8 Enable int8 precision, in addition to fp32 (default = disabled)" "\n" - " --best Enable all precisions to achieve the best performance (default = disabled)" "\n" - " --directIO Avoid reformatting at network boundaries. (default = disabled)" "\n" - " --precisionConstraints=spec Control precision constraint setting. (default = none)" "\n" - " Precision Constaints: spec ::= \"none\" | \"obey\" | \"prefer\"" "\n" - " none = no constraints" "\n" - " prefer = meet precision constraints set by --layerPrecisions/--layerOutputTypes if possible" "\n" - " obey = meet precision constraints set by --layerPrecisions/--layerOutputTypes or fail" "\n" - " otherwise" "\n" - " --layerPrecisions=spec Control per-layer precision constraints. Effective only when precisionConstraints is set to" "\n" - " \"obey\" or \"prefer\". (default = none)" "\n" - " The specs are read left-to-right, and later ones override earlier ones. \"*\" can be used as a" "\n" - " layerName to specify the default precision for all the unspecified layers." "\n" - " Per-layer precision spec ::= layerPrecision[\",\"spec]" "\n" - " layerPrecision ::= layerName\":\"precision" "\n" - " precision ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" "\n" - " --layerOutputTypes=spec Control per-layer output type constraints. Effective only when precisionConstraints is set to" "\n" - " \"obey\" or \"prefer\". (default = none)" "\n" - " The specs are read left-to-right, and later ones override earlier ones. \"*\" can be used as a" "\n" - " layerName to specify the default precision for all the unspecified layers. If a layer has more than""\n" - " one output, then multiple types separated by \"+\" can be provided for this layer." 
"\n" - " Per-layer output type spec ::= layerOutputTypes[\",\"spec]" "\n" - " layerOutputTypes ::= layerName\":\"type" "\n" - " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"[\"+\"type]" "\n" - " --calib= Read INT8 calibration cache file" "\n" - " --safe Enable build safety certified engine" "\n" - " --consistency Perform consistency checking on safety certified engine" "\n" - " --restricted Enable safety scope checking with kSAFETY_SCOPE build flag" "\n" - " --saveEngine= Save the serialized engine" "\n" - " --loadEngine= Load a serialized engine" "\n" - " --tacticSources=tactics Specify the tactics to be used by adding (+) or removing (-) tactics from the default " "\n" - " tactic sources (default = all available tactics)." "\n" - " Note: Currently only cuDNN, cuBLAS and cuBLAS-LT are listed as optional tactics." "\n" - " Tactic Sources: tactics ::= [\",\"tactic]" "\n" - " tactic ::= (+|-)lib" "\n" - " lib ::= \"CUBLAS\"|\"CUBLAS_LT\"|\"CUDNN\"" "\n" - " For example, to disable cudnn and enable cublas: --tacticSources=-CUDNN,+CUBLAS" "\n" - " --noBuilderCache Disable timing cache in builder (default is to enable timing cache)" "\n" - " --timingCacheFile= Save/load the serialized global timing cache" "\n" + // clang-format off + os << "=== Build Options ===" "\n" + " --minShapes=spec Build with dynamic shapes using a profile with the min shapes provided" "\n" + " --optShapes=spec Build with dynamic shapes using a profile with the opt shapes provided" "\n" + " --maxShapes=spec Build with dynamic shapes using a profile with the max shapes provided" "\n" + " --minShapesCalib=spec Calibrate with dynamic shapes using a profile with the min shapes provided" "\n" + " --optShapesCalib=spec Calibrate with dynamic shapes using a profile with the opt shapes provided" "\n" + " --maxShapesCalib=spec Calibrate with dynamic shapes using a profile with the max shapes provided" "\n" + " Note: All three of min, opt and max shapes must be supplied." "\n" + " However, if only opt shapes is supplied then it will be expanded so" "\n" + " that min shapes and max shapes are set to the same values as opt shapes." "\n" + " Input names can be wrapped with escaped single quotes (ex: 'Input:0')." "\n" + " Example input shapes spec: input0:1x3x256x256,input1:1x3x128x128" "\n" + " For scalars (0-D shapes), use input0:scalar or simply input0: with nothing after the colon." "\n" + " Each input shape is supplied as a key-value pair where key is the input name and" "\n" + " value is the dimensions (including the batch dimension) to be used for that input." "\n" + " Each key-value pair has the key and value separated using a colon (:)." "\n" + " Multiple input shapes can be provided via comma-separated key-value pairs, and each input name can" "\n" + " contain at most one wildcard ('*') character." "\n" + " --inputIOFormats=spec Type and format of each of the input tensors (default = all inputs in fp32:chw)" "\n" + " See --outputIOFormats help for the grammar of type and format list." "\n" + " Note: If this option is specified, please set comma-separated types and formats for all" "\n" + " inputs following the same order as network inputs ID (even if only one input" "\n" + " needs specifying IO format) or set the type and format once for broadcasting." 
"\n" + " --outputIOFormats=spec Type and format of each of the output tensors (default = all outputs in fp32:chw)" "\n" + " Note: If this option is specified, please set comma-separated types and formats for all" "\n" + " outputs following the same order as network outputs ID (even if only one output" "\n" + " needs specifying IO format) or set the type and format once for broadcasting." "\n" + R"( IO Formats: spec ::= IOfmt[","spec])" "\n" + " IOfmt ::= type:fmt" "\n" + R"( type ::= "fp32"|"fp16"|"bf16"|"int32"|"int64"|"int8"|"uint8"|"bool")" "\n" + R"( fmt ::= ("chw"|"chw2"|"chw4"|"hwc8"|"chw16"|"chw32"|"dhwc8"|)" "\n" + R"( "cdhw32"|"hwc"|"dla_linear"|"dla_hwc4")["+"fmt])" "\n" + " --memPoolSize=poolspec Specify the size constraints of the designated memory pool(s)" "\n" + " Supports the following base-2 suffixes: " << getAvailableUnitSuffixes() << "." "\n" + " If none of suffixes is appended, the defualt unit is in MiB." "\n" + " Note: Also accepts decimal sizes, e.g. 0.25M. Will be rounded down to the nearest integer bytes." "\n" + " In particular, for dlaSRAM the bytes will be rounded down to the nearest power of 2." "\n" + R"( Pool constraint: poolspec ::= poolfmt[","poolspec])" "\n" + " poolfmt ::= pool:size" "\n" + R"( pool ::= "workspace"|"dlaSRAM"|"dlaLocalDRAM"|"dlaGlobalDRAM"|"tacticSharedMem")" "\n" + " --profilingVerbosity=mode Specify profiling verbosity. mode ::= layer_names_only|detailed|none (default = layer_names_only)." "\n" + " Please only assign once." "\n" + " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " + << defaultAvgTiming << ")" "\n" + " --refit Mark the engine as refittable. This will allow the inspection of refittable layers " "\n" + " and weights within the engine." "\n" + " --stripWeights Strip weights from plan. This flag works with either refit or refit with identical weights. Default""\n" + " to latter, but you can switch to the former by enabling both --stripWeights and --refit at the same""\n" + " time." "\n" + " --stripAllWeights Alias for combining the --refit and --stripWeights options. It marks all weights as refittable," "\n" + " disregarding any performance impact. Additionally, it strips all refittable weights after the " "\n" + " engine is built." "\n" + " --weightless [Deprecated] this knob has been deprecated. Please use --stripWeights" "\n" + " --versionCompatible, --vc Mark the engine as version compatible. This allows the engine to be used with newer versions" "\n" + " of TensorRT on the same host OS, as well as TensorRT's dispatch and lean runtimes." "\n" +#if !TRT_WINML + " --pluginInstanceNorm, --pi Set `kNATIVE_INSTANCENORM` to false in the ONNX parser. This will cause the ONNX parser to use" "\n" + " a plugin InstanceNorm implementation over the native implementation when parsing." "\n" +#endif + R"( --useRuntime=runtime TensorRT runtime to execute engine. "lean" and "dispatch" require loading VC engine and do)" "\n" + " not support building an engine." "\n" + R"( runtime::= "full"|"lean"|"dispatch")" "\n" + " --leanDLLPath= External lean runtime DLL to use in version compatiable mode." "\n" + " --excludeLeanRuntime When --versionCompatible is enabled, this flag indicates that the generated engine should" "\n" + " not include an embedded lean runtime. If this is set, the user must explicitly specify a" "\n" + " valid lean runtime to use when loading the engine." "\n" + " --sparsity=spec Control sparsity (default = disabled). 
" "\n" + R"( Sparsity: spec ::= "disable", "enable", "force")" "\n" + " Note: Description about each of these options is as below" "\n" + " disable = do not enable sparse tactics in the builder (this is the default)" "\n" + " enable = enable sparse tactics in the builder (but these tactics will only be" "\n" + " considered if the weights have the right sparsity pattern)" "\n" + " force = enable sparse tactics in the builder and force-overwrite the weights to have" "\n" + " a sparsity pattern (even if you loaded a model yourself)" "\n" + " [Deprecated] this knob has been deprecated." "\n" + " Please use to rewrite the weights." "\n" + " --noTF32 Disable tf32 precision (default is to enable tf32, in addition to fp32)" "\n" + " --fp16 Enable fp16 precision, in addition to fp32 (default = disabled)" "\n" + " --bf16 Enable bf16 precision, in addition to fp32 (default = disabled)" "\n" + " --int8 Enable int8 precision, in addition to fp32 (default = disabled)" "\n" + " --fp8 Enable fp8 precision, in addition to fp32 (default = disabled)" "\n" + " --int4 Enable int4 precision, in addition to fp32 (default = disabled)" "\n" + " --best Enable all precisions to achieve the best performance (default = disabled)" "\n" + " --stronglyTyped Create a strongly typed network. (default = disabled)" "\n" + " --directIO Avoid reformatting at network boundaries. (default = disabled)" "\n" + " --precisionConstraints=spec Control precision constraint setting. (default = none)" "\n" + R"( Precision Constraints: spec ::= "none" | "obey" | "prefer")" "\n" + " none = no constraints" "\n" + " prefer = meet precision constraints set by --layerPrecisions/--layerOutputTypes if possible" "\n" + " obey = meet precision constraints set by --layerPrecisions/--layerOutputTypes or fail" "\n" + " otherwise" "\n" + " --layerPrecisions=spec Control per-layer precision constraints. Effective only when precisionConstraints is set to" "\n" + R"( "obey" or "prefer". (default = none))" "\n" + R"( The specs are read left-to-right, and later ones override earlier ones. Each layer name can)" "\n" + " contain at most one wildcard ('*') character." "\n" + R"( Per-layer precision spec ::= layerPrecision[","spec])" "\n" + R"( layerPrecision ::= layerName":"precision)" "\n" + R"( precision ::= "fp32"|"fp16"|"bf16"|"int32"|"int8")" "\n" + " --layerOutputTypes=spec Control per-layer output type constraints. Effective only when precisionConstraints is set to" "\n" + R"( "obey" or "prefer". (default = none)" "\n" + R"( The specs are read left-to-right, and later ones override earlier ones. Each layer name can)" "\n" + " contain at most one wildcard ('*') character. If a layer has more than" "\n" + R"( one output, then multiple types separated by "+" can be provided for this layer.)" "\n" + R"( Per-layer output type spec ::= layerOutputTypes[","spec])" "\n" + R"( layerOutputTypes ::= layerName":"type)" "\n" + R"( type ::= "fp32"|"fp16"|"bf16"|"int32"|"int8"["+"type])" "\n" + " --layerDeviceTypes=spec Specify layer-specific device type." "\n" + " The specs are read left-to-right, and later ones override earlier ones. If a layer does not have" "\n" + " a device type specified, the layer will opt for the default device type." 
"\n" + R"( Per-layer device type spec ::= layerDeviceTypePair[","spec])" "\n" + R"( layerDeviceTypePair ::= layerName":"deviceType)" "\n" + R"( deviceType ::= "GPU"|"DLA")" "\n" + " --calib= Read INT8 calibration cache file" "\n" + " --safe Enable build safety certified engine, if DLA is enable, --buildDLAStandalone will be specified" "\n" + " automatically (default = disabled)" "\n" + " --buildDLAStandalone Enable build DLA standalone loadable which can be loaded by cuDLA, when this option is enabled, " "\n" + " --allowGPUFallback is disallowed and --skipInference is enabled by default. Additionally, " "\n" + " specifying --inputIOFormats and --outputIOFormats restricts I/O data type and memory layout" "\n" + " (default = disabled)" "\n" + " --allowGPUFallback When DLA is enabled, allow GPU fallback for unsupported layers (default = disabled)" "\n" + " --restricted Enable safety scope checking with kSAFETY_SCOPE build flag" "\n" + " --saveEngine= Save the serialized engine" "\n" + " --loadEngine= Load a serialized engine" "\n" + " --getPlanVersionOnly Print TensorRT version when loaded plan was created. Works without deserialization of the plan." "\n" + " Use together with --loadEngine. Supported only for engines created with 8.6 and forward." "\n" + " --tacticSources=tactics Specify the tactics to be used by adding (+) or removing (-) tactics from the default " "\n" + " tactic sources (default = all available tactics)." "\n" + " Note: Currently only cuDNN, cuBLAS, cuBLAS-LT, and edge mask convolutions are listed as optional" "\n" + " tactics." "\n" + R"( Tactic Sources: tactics ::= [","tactic])" "\n" + " tactic ::= (+|-)lib" "\n" + R"( lib ::= "CUBLAS"|"CUBLAS_LT"|"CUDNN"|"EDGE_MASK_CONVOLUTIONS")" "\n" + R"( |"JIT_CONVOLUTIONS")" "\n" + " For example, to disable cudnn and enable cublas: --tacticSources=-CUDNN,+CUBLAS" "\n" + " --noBuilderCache Disable timing cache in builder (default is to enable timing cache)" "\n" + " --noCompilationCache Disable Compilation cache in builder, and the cache is part of timing cache (default is to enable compilation cache)" "\n" + " --errorOnTimingCacheMiss Emit error when a tactic being timed is not present in the timing cache (default = false)" "\n" + " --timingCacheFile= Save/load the serialized global timing cache" "\n" + " --preview=features Specify preview feature to be used by adding (+) or removing (-) preview features from the default" "\n" + R"( Preview Features: features ::= [","feature])" "\n" + " feature ::= (+|-)flag" "\n" + R"( flag ::= "aliasedPluginIO1003")" "\n" + R"( |"profileSharing0806")" "\n" + " --builderOptimizationLevel Set the builder optimization level. (default is 3)" "\n" + " Higher level allows TensorRT to spend more building time for more optimization options." "\n" + " Valid values include integers from 0 to the maximum optimization level, which is currently 5." "\n" + " --maxTactics Set the maximum number of tactics to time when there is a choice of tactics. (default is -1)" "\n" + " Larger number of tactics allow TensorRT to spend more building time on evaluating tactics." "\n" + " Default value -1 means TensorRT can decide the number of tactics based on its own heuristic." "\n" + " --hardwareCompatibilityLevel=mode Make the engine file compatible with other GPU architectures. 
(default = none)" "\n" + R"( Hardware Compatibility Level: mode ::= "none" | "ampere+")" "\n" + " none = no compatibility" "\n" + " ampere+ = compatible with Ampere and newer GPUs" "\n" + " --runtimePlatform=platform Set the target platform for runtime execution. (default = SameAsBuild)" "\n" + " When this option is enabled, --skipInference is enabled by default." "\n" + R"( RuntimePlatfrom: platform ::= "SameAsBuild" | "WindowsAMD64")" "\n" + " SameAsBuild = no requirement for cross-platform compatibility." "\n" + " WindowsAMD64 = set the target platform for engine execution as Windows AMD64 system" "\n" + " --tempdir= Overrides the default temporary directory TensorRT will use when creating temporary files." "\n" + " See IRuntime::setTemporaryDirectory API documentation for more information." "\n" + " --tempfileControls=controls Controls what TensorRT is allowed to use when creating temporary executable files." "\n" + " Should be a comma-separated list with entries in the format (in_memory|temporary):(allow|deny)." "\n" + " in_memory: Controls whether TensorRT is allowed to create temporary in-memory executable files." "\n" + " temporary: Controls whether TensorRT is allowed to create temporary executable files in the" "\n" + " filesystem (in the directory given by --tempdir)." "\n" + " For example, to allow in-memory files and disallow temporary files:" "\n" + " --tempfileControls=in_memory:allow,temporary:deny" "\n" + R"( If a flag is unspecified, the default behavior is "allow".)" "\n" + " --maxAuxStreams=N Set maximum number of auxiliary streams per inference stream that TRT is allowed to use to run " "\n" + " kernels in parallel if the network contains ops that can run in parallel, with the cost of more " "\n" + " memory usage. Set this to 0 for optimal memory usage. (default = using heuristics)" "\n" + " --profile Build with dynamic shapes using a profile with the min/max/opt shapes provided. Can be specified" "\n" + " multiple times to create multiple profiles with contiguous index." "\n" + " (ex: --profile=0 --minShapes= --optShapes= --maxShapes= --profile=1 ...)" "\n" + " --calibProfile Select the optimization profile to calibrate by index. (default = " + << defaultOptProfileIndex << ")" "\n" + " --allowWeightStreaming Enable a weight streaming engine. Must be specified with --stronglyTyped. TensorRT will disable" "\n" + " weight streaming at runtime unless --weightStreamingBudget is specified." "\n" + " --markDebug Specify list of names of tensors to be marked as debug tensors. 
Separate names with a comma" "\n" ; -// clang-format on + // clang-format on os << std::flush; } void SystemOptions::help(std::ostream& os) { -// clang-format off + // clang-format off os << "=== System Options ===" << std::endl << " --device=N Select cuda device N (default = " << defaultDevice << ")" << std::endl << " --useDLACore=N Select DLA core N for layers that support DLA (default = none)" << std::endl << - " --allowGPUFallback When DLA is enabled, allow GPU fallback for unsupported layers " - "(default = disabled)" << std::endl; - os << " --plugins Plugin library (.so) to load (can be specified multiple times)" << std::endl; -// clang-format on +#if TRT_WINML + std::endl; +#else + " --staticPlugins Plugin library (.so) to load statically (can be specified multiple times)" << std::endl << + " --dynamicPlugins Plugin library (.so) to load dynamically and may be serialized with the engine if they are included in --setPluginsToSerialize (can be specified multiple times)" << std::endl << + " --setPluginsToSerialize Plugin library (.so) to be serialized with the engine (can be specified multiple times)" << std::endl << + " --ignoreParsedPluginLibs By default, when building a version-compatible engine, plugin libraries specified by the ONNX parser " << std::endl << + " are implicitly serialized with the engine (unless --excludeLeanRuntime is specified) and loaded dynamically. " << std::endl << + " Enable this flag to ignore these plugin libraries instead." << std::endl; +#endif + // clang-format on } void InferenceOptions::help(std::ostream& os) { // clang-format off os << "=== Inference Options ===" << std::endl << - " --batch=N Set batch size for implicit batch engines (default = " << defaultBatch << ")" << std::endl << - " This option should not be used when the engine is built from an ONNX model or when dynamic" << std::endl << - " shapes are provided when the engine is built." << std::endl << " --shapes=spec Set input shapes for dynamic shapes inference inputs." << std::endl << - " Note: Input names can be wrapped with escaped single quotes (ex: \\\'Input:0\\\')." << std::endl << + R"( Note: Input names can be wrapped with escaped single quotes (ex: 'Input:0').)" << std::endl << " Example input shapes spec: input0:1x3x256x256, input1:1x3x128x128" << std::endl << + " For scalars (0-D shapes), use input0:scalar or simply input0: with nothing after the colon."<< std::endl << " Each input shape is supplied as a key-value pair where key is the input name and" << std::endl << " value is the dimensions (including the batch dimension) to be used for that input." << std::endl << " Each key-value pair has the key and value separated using a colon (:)." << std::endl << - " Multiple input shapes can be provided via comma-separated key-value pairs." << std::endl << + " Multiple input shapes can be provided via comma-separated key-value pairs, and each input " << std::endl << + " name can contain at most one wildcard ('*') character." << std::endl << " --loadInputs=spec Load input values from files (default = generate random inputs). Input names can be " "wrapped with single quotes (ex: 'Input:0')" << std::endl << - " Input values spec ::= Ival[\",\"spec]" << std::endl << - " Ival ::= name\":\"file" << std::endl << + R"( Input values spec ::= Ival[","spec])" << std::endl << + R"( Ival ::= name":"file)" << std::endl << + " Consult the README for more information on generating files for custom inputs." 
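// Illustrative only (tensor and file names are placeholders): fixing a dynamic
// input shape and feeding custom data, per the --shapes/--loadInputs specs above:
//   --shapes=input0:1x3x640x640 --loadInputs=input0:input0.bin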
<< std::endl << " --iterations=N Run at least N inference iterations (default = " << defaultIterations << ")" << std::endl << " --warmUp=N Run for N milliseconds to warmup before measuring performance (default = " << defaultWarmUp << ")" << std::endl << " --duration=N Run performance measurements for at least N seconds wallclock time (default = " << defaultDuration << ")" << std::endl << + " If -1 is specified, inference will keep running unless stopped manually" << std::endl << " --sleepTime=N Delay inference start with a gap of N milliseconds between launch and compute " "(default = " << defaultSleep << ")" << std::endl << " --idleTime=N Sleep N milliseconds between two continuous iterations" "(default = " << defaultIdle << ")" << std::endl << - " --streams=N Instantiate N engines to use concurrently (default = " << defaultStreams << ")" << std::endl << + " --infStreams=N Instantiate N execution contexts to run inference concurrently " + "(default = " << defaultStreams << ")" << std::endl << " --exposeDMA Serialize DMA transfers to and from device (default = disabled)." << std::endl << " --noDataTransfers Disable DMA transfers to and from device (default = enabled)." << std::endl << - " --useManagedMemory Use managed memory instead of seperate host and device allocations (default = disabled)." << std::endl << + " --useManagedMemory Use managed memory instead of separate host and device allocations (default = disabled)." << std::endl << " --useSpinWait Actively synchronize on GPU events. This option may decrease synchronization time but " "increase CPU usage and power (default = disabled)" << std::endl << " --threads Enable multithreading to drive engines with independent threads" @@ -1677,42 +2655,84 @@ void InferenceOptions::help(std::ostream& os) " --timeRefit Time the amount of time it takes to refit the engine before inference." << std::endl << " --separateProfileRun Do not attach the profiler in the benchmark run; if profiling is enabled, a second " "profile run will be executed (default = disabled)" << std::endl << - " --buildOnly Skip inference perf measurement (default = disabled)" << std::endl; + " --skipInference Exit after the engine has been built and skip inference perf measurement " + "(default = disabled)" << std::endl << + " --persistentCacheRatio Set the persistentCacheLimit in ratio, 0.5 represent half of max persistent L2 size " + "(default = 0)" << std::endl << + " --useProfile Set the optimization profile for the inference context " + "(default = " << defaultOptProfileIndex << " )." << std::endl << + " --allocationStrategy=spec Specify how the internal device memory for inference is allocated." << std::endl << + R"( Strategy: spec ::= "static", "profile", "runtime")" << std::endl << + " static = Allocate device memory based on max size across all profiles." << std::endl << + " profile = Allocate device memory based on max size of the current profile." << std::endl << + " runtime = Allocate device memory based on the actual input shapes." << std::endl << + " --saveDebugTensors Specify list of names of tensors to turn on the debug state" << std::endl << + " and filename to save raw outputs to." << std::endl << + " These tensors must be specified as debug tensors during build time." << std::endl << + R"( Input values spec ::= Ival[","spec])" << std::endl << + R"( Ival ::= name":"file)" << std::endl << + " --weightStreamingBudget Set the maximum amount of GPU memory TensorRT is allowed to use for weights." 
<< std::endl << + " It can take on the following values:" << std::endl << + " -2: (default) Disable weight streaming at runtime." << std::endl << + " -1: TensorRT will automatically decide the budget." << std::endl << + " 0-100%: Percentage of streamable weights that reside on the GPU." << std::endl << + " 0% saves the most memory but will have the worst performance." << std::endl << + " Requires the % character." << std::endl << + " >=0B: The exact amount of streamable weights that reside on the GPU. Supports the " << std::endl << + " following base-2 suffixes: " << getAvailableUnitSuffixes() << "." << std::endl; // clang-format on } void ReportingOptions::help(std::ostream& os) { -// clang-format off + // clang-format off os << "=== Reporting Options ===" << std::endl << " --verbose Use verbose logging (default = false)" << std::endl << " --avgRuns=N Report performance measurements averaged over N consecutive " "iterations (default = " << defaultAvgRuns << ")" << std::endl << - " --percentile=P Report performance for the P percentage (0<=P<=100, 0 " + " --percentile=P1,P2,P3,... Report performance for the P1,P2,P3,... percentages (0<=P_i<=100, 0 " "representing max perf, and 100 representing min perf; (default" - " = " << defaultPercentile << "%)" << std::endl << + " = " << joinValuesToString(defaultPercentiles, ",") << "%)" << std::endl << " --dumpRefit Print the refittable layers and weights from a refittable " "engine" << std::endl << " --dumpOutput Print the output tensor(s) of the last inference iteration " "(default = disabled)" << std::endl << + " --dumpRawBindingsToFile Print the input/output tensor(s) of the last inference iteration to file" + "(default = disabled)" << std::endl << " --dumpProfile Print profile information per layer (default = disabled)" << std::endl << " --dumpLayerInfo Print layer information of the engine to console " "(default = disabled)" << std::endl << + " --dumpOptimizationProfile Print the optimization profile(s) information " + "(default = disabled)" << std::endl << " --exportTimes= Write the timing results in a json file (default = disabled)" << std::endl << " --exportOutput= Write the output tensors to a json file (default = disabled)" << std::endl << " --exportProfile= Write the profile information per layer in a json file " "(default = disabled)" << std::endl << " --exportLayerInfo= Write the layer information of the engine in a json file " "(default = disabled)" << std::endl; -// clang-format on + // clang-format on +} + +void TaskInferenceOptions::help(std::ostream& os) +{ + // clang-format off + os << "=== Task Inference Options ===" << std::endl << + " engine= Specify a serialized engine for this task" << std::endl << + " device=N Specify a GPU device for this task" << std::endl << + " DLACore=N Specify a DLACore for this task" << std::endl << + " batch=N Set batch size for implicit batch engines (default = " << defaultBatch << ")" << std::endl << + " This option should not be used for explicit batch engines" << std::endl << + " graph=1 Use cuda graph for this task" << std::endl << + " persistentCacheRatio=[0-1] Set the persistentCacheLimit ratio for this task (default = 0)" << std::endl; + // clang-format on } void helpHelp(std::ostream& os) { -// clang-format off + // clang-format off os << "=== Help ===" << std::endl << " --help, -h Print this message" << std::endl; -// clang-format on + // clang-format on } void AllOptions::help(std::ostream& os) @@ -1723,19 +2743,6 @@ void AllOptions::help(std::ostream& os) os << std::endl; 
InferenceOptions::help(os); os << std::endl; -// clang-format off - os << "=== Build and Inference Batch Options ===" << std::endl << - " When using implicit batch, the max batch size of the engine, if not given, " << std::endl << - " is set to the inference batch size;" << std::endl << - " when using explicit batch, if shapes are specified only for inference, they " << std::endl << - " will be used also as min/opt/max in the build profile; if shapes are " << std::endl << - " specified only for the build, the opt shapes will be used also for inference;" << std::endl << - " if both are specified, they must be compatible; and if explicit batch is " << std::endl << - " enabled but neither is specified, the model must provide complete static" << std::endl << - " dimensions, including batch size, for all inputs" << std::endl << - " Using ONNX models automatically forces explicit batch." << std::endl << - std::endl; - // clang-format on ReportingOptions::help(os); os << std::endl; SystemOptions::help(os); @@ -1745,7 +2752,7 @@ void AllOptions::help(std::ostream& os) void SafeBuilderOptions::printHelp(std::ostream& os) { -// clang-format off + // clang-format off os << "=== Mandatory ===" << std::endl << " --onnx=<file> ONNX model" << std::endl << " " << std::endl << @@ -1759,20 +2766,34 @@ void SafeBuilderOptions::printHelp(std::ostream& os) " Note: If this option is specified, please set comma-separated types and formats for all" << std::endl << " outputs following the same order as network outputs ID (even if only one output" << std::endl << " needs specifying IO format) or set the type and format once for broadcasting." << std::endl << - " IO Formats: spec ::= IOfmt[\",\"spec]" << std::endl << + R"( IO Formats: spec ::= IOfmt[","spec])" << std::endl << " IOfmt ::= type:fmt" << std::endl << - " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" << std::endl << - " fmt ::= (\"chw\"|\"chw2\"|\"chw4\"|\"hwc8\"|\"chw16\"|\"chw32\"|\"dhwc8\")[\"+\"fmt]" << std::endl << + R"( type ::= "fp32"|"fp16"|"int32"|"int8")" << std::endl << + R"( fmt ::= ("chw"|"chw2"|"chw4"|"hwc8"|"chw16"|"chw32"|"dhwc8"|)" << std::endl << + R"( "cdhw32"|"hwc"|"dla_linear"|"dla_hwc4")["+"fmt])" << std::endl << " --int8 Enable int8 precision, in addition to fp16 (default = disabled)" << std::endl << - " --consistency Enable consistency check for serialized engine, (default = disabled)" << std::endl << " --std Build standard serialized engine, (default = disabled)" << std::endl << " --calib=<file> Read INT8 calibration cache file" << std::endl << " --serialized=<file> Save the serialized network" << std::endl << - " --plugins Plugin library (.so) to load (can be specified multiple times)" << std::endl << +#if !TRT_WINML + " --staticPlugins Plugin library (.so) to load statically (can be specified multiple times)" << std::endl << +#endif " --verbose or -v Use verbose logging (default = false)" << std::endl << " --help or -h Print this message" << std::endl << - " " << std::endl; -// clang-format on + " --noBuilderCache Disable timing cache in builder (default is to enable timing cache)" << std::endl << + " --timingCacheFile=<file> Save/load the serialized global timing cache" << std::endl << + " --sparsity=spec Control sparsity (default = disabled). 
" << std::endl << + R"( Sparsity: spec ::= "disable", "enable", "force")" << std::endl << + " Note: Description about each of these options is as below" << std::endl << + " disable = do not enable sparse tactics in the builder (this is the default)" << std::endl << + " enable = enable sparse tactics in the builder (but these tactics will only be" << std::endl << + " considered if the weights have the right sparsity pattern)" << std::endl << + " force = enable sparse tactics in the builder and force-overwrite the weights to have" << std::endl << + " a sparsity pattern" << std::endl << + " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " << std::endl << + "" << defaultAvgTiming << ")" << std::endl << + "" << std::endl; + // clang-format on } } // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleOptions.h b/src/Detector/tensorrt_yolo/common/sampleOptions.h index 8975e1ea..8ca0a655 100644 --- a/src/Detector/tensorrt_yolo/common/sampleOptions.h +++ b/src/Detector/tensorrt_yolo/common/sampleOptions.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -23,6 +24,7 @@ #include #include #include +#include #include #include @@ -32,9 +34,10 @@ namespace sample { // Build default params -constexpr int32_t maxBatchNotProvided{0}; -constexpr int32_t defaultMinTiming{1}; constexpr int32_t defaultAvgTiming{8}; +constexpr int32_t defaultMaxAuxStreams{-1}; +constexpr int32_t defaultBuilderOptimizationLevel{-1}; +constexpr int32_t defaultMaxTactics{-1}; // System default params constexpr int32_t defaultDevice{0}; @@ -44,14 +47,16 @@ constexpr int32_t defaultBatch{1}; constexpr int32_t batchNotProvided{0}; constexpr int32_t defaultStreams{1}; constexpr int32_t defaultIterations{10}; +constexpr int32_t defaultOptProfileIndex{0}; constexpr float defaultWarmUp{200.F}; constexpr float defaultDuration{3.F}; constexpr float defaultSleep{}; constexpr float defaultIdle{}; +constexpr float defaultPersistentCacheRatio{0}; // Reporting default params constexpr int32_t defaultAvgRuns{10}; -constexpr float defaultPercentile{99}; +constexpr std::array defaultPercentiles{90, 95, 99}; enum class PrecisionConstraints { @@ -63,9 +68,7 @@ enum class PrecisionConstraints enum class ModelFormat { kANY, - kCAFFE, - kONNX, - kUFF + kONNX }; enum class SparsityFlag @@ -82,7 +85,55 @@ enum class TimingCacheMode kGLOBAL }; -using Arguments = std::unordered_multimap; +enum class MemoryAllocationStrategy +{ + kSTATIC, //< Allocate device memory based on max size across all profiles. + kPROFILE, //< Allocate device memory based on max size of the current profile. + kRUNTIME, //< Allocate device memory based on the current input shapes. +}; + +//! +//! \enum RuntimeMode +//! +//! \brief Used to dictate which TensorRT runtime library to dynamically load. +//! +enum class RuntimeMode +{ + //! Maps to libnvinfer.so or nvinfer.dll + kFULL, + + //! 
+ //! Maps to libnvinfer_dispatch.so or nvinfer_dispatch.dll + kDISPATCH, + + //! Maps to libnvinfer_lean.so or nvinfer_lean.dll + kLEAN, +}; + +inline std::ostream& operator<<(std::ostream& os, RuntimeMode const mode) +{ + switch (mode) + { + case RuntimeMode::kFULL: + { + os << "full"; + break; + } + case RuntimeMode::kDISPATCH: + { + os << "dispatch"; + break; + } + case RuntimeMode::kLEAN: + { + os << "lean"; + break; + } + } + + return os; +} + +using Arguments = std::unordered_multimap<std::string, std::pair<std::string, int32_t>>; using IOFormat = std::pair<nvinfer1::DataType, nvinfer1::TensorFormats>; @@ -90,135 +141,201 @@ using ShapeRange = std::array<std::vector<int32_t>, nvinfer1::EnumMax<nvinfer1::OptProfileSelector>()>; using LayerOutputTypes = std::unordered_map<std::string, std::vector<nvinfer1::DataType>>; +using LayerDeviceTypes = std::unordered_map<std::string, nvinfer1::DeviceType>; -struct Options -{ - virtual void parse(Arguments& arguments) = 0; -}; +using StringSet = std::unordered_set<std::string>; -struct BaseModelOptions : public Options +class WeightStreamingBudget { - ModelFormat format{ModelFormat::kANY}; - std::string model; +public: + static constexpr int64_t kDISABLE{-2}; + static constexpr int64_t kAUTOMATIC{-1}; + int64_t bytes{kDISABLE}; + double percent{static_cast<double>(100.0)}; - void parse(Arguments& arguments) override; + bool isDisabled() + { + return bytes == kDISABLE && percent == kDISABLE; + } +}; - static void help(std::ostream& out); +class Options +{ +public: + virtual ~Options() = default; + virtual void parse(Arguments& arguments) = 0; }; -struct UffInput : public Options +class BaseModelOptions : public Options { - std::vector<std::pair<std::string, nvinfer1::Dims3>> inputs; - bool NHWC{false}; +public: + ModelFormat format{ModelFormat::kANY}; + std::string model; void parse(Arguments& arguments) override; static void help(std::ostream& out); }; -struct ModelOptions : public Options +class ModelOptions : public Options { +public: BaseModelOptions baseModel; std::string prototxt; std::vector<std::string> outputs; - UffInput uffInputs; void parse(Arguments& arguments) override; static void help(std::ostream& out); }; -struct BuildOptions : public Options +constexpr nvinfer1::TempfileControlFlags getTempfileControlDefaults() { - int32_t maxBatch{maxBatchNotProvided}; + using F = nvinfer1::TempfileControlFlag; + return (1U << static_cast<uint32_t>(F::kALLOW_TEMPORARY_FILES)) + | (1U << static_cast<uint32_t>(F::kALLOW_IN_MEMORY_FILES)); +} + +class BuildOptions : public Options +{ +public: + // Unit in MB. double workspace{-1.0}; + // Unit in MB. double dlaSRAM{-1.0}; + // Unit in MB. double dlaLocalDRAM{-1.0}; + // Unit in MB. double dlaGlobalDRAM{-1.0}; - int32_t minTiming{defaultMinTiming};
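    // Hypothetical usage sketch (not part of this patch): the sizes parsed from
    // --memPoolSize land in the pool fields above, e.g.
    //   BuildOptions opts;
    //   opts.workspace = 1024.0; // --memPoolSize=workspace:1024 (MiB)
    //   opts.dlaSRAM = 0.5;      // --memPoolSize=dlaSRAM:0.5M (rounded down to a power of 2 in bytes)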
+ // Unit in KB. + double tacticSharedMem{-1.0}; int32_t avgTiming{defaultAvgTiming}; + size_t calibProfile{defaultOptProfileIndex}; bool tf32{true}; bool fp16{false}; + bool bf16{false}; bool int8{false}; + bool fp8{false}; + bool int4{false}; + bool stronglyTyped{false}; bool directIO{false}; PrecisionConstraints precisionConstraints{PrecisionConstraints::kNONE}; LayerPrecisions layerPrecisions; LayerOutputTypes layerOutputTypes; + LayerDeviceTypes layerDeviceTypes; + StringSet debugTensors; + StringSet debugTensorStates; bool safe{false}; - bool consistency{false}; + bool buildDLAStandalone{false}; + bool allowGPUFallback{false}; bool restricted{false}; + bool skipInference{false}; bool save{false}; bool load{false}; bool refittable{false}; + bool stripWeights{false}; + bool versionCompatible{false}; + bool pluginInstanceNorm{false}; + bool excludeLeanRuntime{false}; + bool disableCompilationCache{false}; + int32_t builderOptimizationLevel{defaultBuilderOptimizationLevel}; + int32_t maxTactics{defaultMaxTactics}; SparsityFlag sparsity{SparsityFlag::kDISABLE}; -#if (NV_TENSORRT_MAJOR > 7) - nvinfer1::ProfilingVerbosity profilingVerbosity{nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY}; -#else - nvinfer1::ProfilingVerbosity profilingVerbosity{ nvinfer1::ProfilingVerbosity::kDEFAULT }; -#endif + nvinfer1::ProfilingVerbosity profilingVerbosity{nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY}; std::string engine; std::string calibration; - std::unordered_map<std::string, ShapeRange> shapes; - std::unordered_map<std::string, ShapeRange> shapesCalib; + using ShapeProfile = std::unordered_map<std::string, ShapeRange>; + std::vector<ShapeProfile> optProfiles; + ShapeProfile shapesCalib; std::vector<IOFormat> inputFormats; std::vector<IOFormat> outputFormats; nvinfer1::TacticSources enabledTactics{0}; nvinfer1::TacticSources disabledTactics{0}; TimingCacheMode timingCacheMode{TimingCacheMode::kLOCAL}; std::string timingCacheFile{}; + bool errorOnTimingCacheMiss{false};
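    // A minimal sketch (assumed usage, not part of this patch) of filling one
    // optimization profile via the ShapeProfile/ShapeRange aliases above, where
    // ShapeRange slots 0/1/2 correspond to OptProfileSelector kMIN/kOPT/kMAX:
    //   BuildOptions::ShapeProfile profile;
    //   ShapeRange range;
    //   range[0] = {1, 3, 640, 640}; // kMIN
    //   range[1] = {4, 3, 640, 640}; // kOPT
    //   range[2] = {8, 3, 640, 640}; // kMAX
    //   profile["input0"] = range;
    //   buildOptions.optProfiles.push_back(profile);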
+ // C++11 does not automatically generate hash function for enum class. + // Use int32_t to support C++11 compilers. + std::unordered_map<int32_t, bool> previewFeatures; + nvinfer1::HardwareCompatibilityLevel hardwareCompatibilityLevel{nvinfer1::HardwareCompatibilityLevel::kNONE}; + nvinfer1::RuntimePlatform runtimePlatform{nvinfer1::RuntimePlatform::kSAME_AS_BUILD}; + std::string tempdir{}; + nvinfer1::TempfileControlFlags tempfileControls{getTempfileControlDefaults()}; + RuntimeMode useRuntime{RuntimeMode::kFULL}; + std::string leanDLLPath{}; + int32_t maxAuxStreams{defaultMaxAuxStreams}; + bool getPlanVersionOnly{false}; + + bool allowWeightStreaming{false}; + void parse(Arguments& arguments) override; static void help(std::ostream& out); }; -struct SystemOptions : public Options +class SystemOptions : public Options { +public: int32_t device{defaultDevice}; int32_t DLACore{-1}; - bool fallback{false}; + bool ignoreParsedPluginLibs{false}; std::vector<std::string> plugins; + std::vector<std::string> setPluginsToSerialize; + std::vector<std::string> dynamicPlugins; void parse(Arguments& arguments) override; static void help(std::ostream& out); }; -struct InferenceOptions : public Options +class InferenceOptions : public Options { +public: int32_t batch{batchNotProvided}; int32_t iterations{defaultIterations}; - int32_t streams{defaultStreams}; + int32_t infStreams{defaultStreams}; + int32_t optProfileIndex{defaultOptProfileIndex}; float warmup{defaultWarmUp}; float duration{defaultDuration}; float sleep{defaultSleep}; float idle{defaultIdle}; + float persistentCacheRatio{defaultPersistentCacheRatio}; bool overlap{true}; bool skipTransfers{false}; bool useManaged{false}; bool spin{false}; bool threads{false}; bool graph{false}; - bool skip{false}; bool rerun{false}; bool timeDeserialize{false}; bool timeRefit{false}; + bool setOptProfile{false}; std::unordered_map<std::string, std::string> inputs; - std::unordered_map<std::string, std::vector<int32_t>> shapes; + using ShapeProfile = std::unordered_map<std::string, std::vector<int32_t>>; + ShapeProfile shapes; + nvinfer1::ProfilingVerbosity nvtxVerbosity{nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY}; + MemoryAllocationStrategy memoryAllocationStrategy{MemoryAllocationStrategy::kSTATIC}; + std::unordered_map<std::string, std::string> debugTensorFileNames; + + WeightStreamingBudget weightStreamingBudget; void parse(Arguments& arguments) override; static void help(std::ostream& out); }; -struct ReportingOptions : public Options +class ReportingOptions : public Options { +public: bool verbose{false}; int32_t avgs{defaultAvgRuns}; - float percentile{defaultPercentile}; + std::vector<float> percentiles{defaultPercentiles.begin(), defaultPercentiles.end()}; bool refit{false}; bool output{false}; + bool dumpRawBindings{false}; bool profile{false}; bool layerInfo{false}; + bool optProfileInfo{false}; std::string exportTimes; std::string exportOutput; std::string exportProfile; @@ -229,8 +346,9 @@ struct ReportingOptions : public Options static void help(std::ostream& out); }; -struct SafeBuilderOptions : public Options +class SafeBuilderOptions : public Options { +public: std::string serialized{}; std::string onnxModelFile{}; bool help{false}; @@ -238,18 +356,24 @@ struct SafeBuilderOptions : public Options std::vector<IOFormat> inputFormats; std::vector<IOFormat> outputFormats; bool int8{false}; + bool fp8{false}; + bool int4{false}; std::string calibFile{}; std::vector<std::string> plugins; - bool consistency{false}; bool standard{false}; + TimingCacheMode timingCacheMode{TimingCacheMode::kLOCAL}; + std::string timingCacheFile{}; + SparsityFlag sparsity{SparsityFlag::kDISABLE}; + int32_t avgTiming{defaultAvgTiming}; void parse(Arguments& arguments) override; static void printHelp(std::ostream& out); }; -struct AllOptions : public Options
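// For illustration (not part of this patch): ReportingOptions now carries a
// list of percentiles instead of the old single value, e.g.
//   ReportingOptions reporting;            // defaults to {90, 95, 99}
//   reporting.percentiles = {50.F, 99.F};  // report the median and the tail only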
+class AllOptions : public Options { +public: ModelOptions model; BuildOptions build; SystemOptions system; @@ -262,6 +386,20 @@ struct AllOptions : public Options static void help(std::ostream& out); }; +class TaskInferenceOptions : public Options +{ +public: + std::string engine; + int32_t device{defaultDevice}; + int32_t DLACore{-1}; + int32_t batch{batchNotProvided}; + bool graph{false}; + float persistentCacheRatio{defaultPersistentCacheRatio}; + void parse(Arguments& arguments) override; + static void help(std::ostream& os); +}; + + Arguments argsToArgumentsMap(int32_t argc, char* argv[]); bool parseHelp(Arguments& arguments); @@ -272,8 +410,6 @@ void helpHelp(std::ostream& out); std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options); -std::ostream& operator<<(std::ostream& os, const UffInput& input); std::ostream& operator<<(std::ostream& os, const IOFormat& format); std::ostream& operator<<(std::ostream& os, const ShapeRange& dims); @@ -292,6 +428,10 @@ std::ostream& operator<<(std::ostream& os, const AllOptions& options); std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options); +std::ostream& operator<<(std::ostream& os, nvinfer1::DataType dtype); + +std::ostream& operator<<(std::ostream& os, nvinfer1::DeviceType devType); + inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) { for (int32_t i = 0; i < dims.nbDims; ++i) @@ -329,13 +469,11 @@ inline std::ostream& operator<<(std::ostream& os, const nvinfer1::WeightsRole ro os << "Constant"; break; } -#if (NV_TENSORRT_MAJOR > 7) case nvinfer1::WeightsRole::kANY: { os << "Any"; break; } -#endif } return os; diff --git a/src/Detector/tensorrt_yolo/common/sampleReporting.cpp b/src/Detector/tensorrt_yolo/common/sampleReporting.cpp index a92938c5..e9dda6e0 100644 --- a/src/Detector/tensorrt_yolo/common/sampleReporting.cpp +++ b/src/Detector/tensorrt_yolo/common/sampleReporting.cpp @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -26,6 +27,8 @@ #include "sampleOptions.h" #include "sampleReporting.h" +using namespace nvinfer1; + namespace sample { @@ -45,7 +48,7 @@ float findPercentile(float percentile, std::vector<InferenceTime> const& timings { return std::numeric_limits<float>::infinity(); } - if (percentile < 0.0f || percentile > 100.0f) + if (percentile < 0.F || percentile > 100.F) { throw std::runtime_error("percentile is not in [0, 100]!"); } @@ -99,8 +102,26 @@ float findCoeffOfVariance(std::vector<InferenceTime> const& timings, T const& to inline InferenceTime traceToTiming(const InferenceTrace& a) { - return InferenceTime((a.enqEnd - a.enqStart), (a.h2dEnd - a.h2dStart), (a.computeEnd - a.computeStart), - (a.d2hEnd - a.d2hStart), (a.d2hEnd - a.h2dStart)); + return InferenceTime( + (a.enqEnd - a.enqStart), (a.h2dEnd - a.h2dStart), (a.computeEnd - a.computeStart), (a.d2hEnd - a.d2hStart)); +} + +inline std::string dimsToString(Dims const& shape) +{ + std::stringstream ss; + + if (shape.nbDims == 0) + { + ss << "scalar"; + } + else + { + for (int32_t i = 0; i < shape.nbDims; i++) + { + ss << shape.d[i] << (i != shape.nbDims - 1 ? "x" : ""); + } + } + return ss.str(); } } // namespace @@ -113,29 +134,40 @@ void printProlog(int32_t warmups, int32_t timings, float warmupMs, float benchTi void printTiming(std::vector<InferenceTime> const& timings, int32_t runsPerAvg, std::ostream& os) { - int32_t count = 0; + int64_t count = 0; InferenceTime sum; os << std::endl; os << "=== Trace details ===" << std::endl; os << "Trace averages of " << runsPerAvg << " runs:" << std::endl; - for (auto const& t : timings) + + // Show only the first N lines and the last N lines, where N = kTIMING_PRINT_THRESHOLD. + constexpr int64_t kTIMING_PRINT_THRESHOLD{200}; + int64_t const maxNbTimings{kTIMING_PRINT_THRESHOLD * runsPerAvg}; + + for (int64_t idx = 0, size = timings.size(); idx < size; ++idx) { - sum += t; + // Omit some latency printing to avoid very long logs. + if (size > 2 * maxNbTimings && idx == maxNbTimings) + { + os << "... Omitting " << (size - 2 * maxNbTimings) << " lines" << std::endl; + idx = size - kTIMING_PRINT_THRESHOLD * runsPerAvg - 1; + } + + sum += timings[idx]; if (++count == runsPerAvg) { // clang-format off os << "Average on " << runsPerAvg << " runs - GPU latency: " << sum.compute / runsPerAvg - << " ms - Host latency: " << sum.latency() / runsPerAvg << " ms (end to end " << sum.e2e / runsPerAvg - << " ms, enqueue " << sum.enq / runsPerAvg << " ms)" << std::endl; + << " ms - Host latency: " << sum.latency() / runsPerAvg << " ms (enqueue " << sum.enq / runsPerAvg + << " ms)" << std::endl; // clang-format on count = 0; sum.enq = 0; sum.h2d = 0; sum.compute = 0; sum.d2h = 0; - sum.e2e = 0; } } } @@ -166,14 +198,10 @@ void printMetricExplanations(std::ostream& os) os << "Latency: the summation of H2D Latency, GPU Compute Time, and D2H Latency. This is the latency to infer a " "single query." << std::endl; - os << "End-to-End Host Latency: the duration from when the H2D of a query is called to when the D2H of the same " "query is completed, which includes the latency to wait for the completion of the previous query. This is " "the latency of a query if multiple queries are enqueued consecutively." - << std::endl; }
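// Example behavior of dimsToString() above (values illustrative):
//   a Dims with nbDims == 0        -> "scalar"
//   a Dims with d = {8, 16, 3}     -> "8x16x3"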
PerformanceResult getPerformanceResult(std::vector<InferenceTime> const& timings, - std::function<float(InferenceTime const&)> metricGetter, float percentile) + std::function<float(InferenceTime const&)> metricGetter, std::vector<float> const& percentiles) { auto const metricComparator = [metricGetter](InferenceTime const& a, InferenceTime const& b) { return metricGetter(a) < metricGetter(b); }; @@ -183,40 +211,44 @@ PerformanceResult result; result.min = metricGetter(newTimings.front()); result.max = metricGetter(newTimings.back()); - result.mean = std::accumulate(newTimings.begin(), newTimings.end(), 0.0f, metricAccumulator) / newTimings.size(); + result.mean = std::accumulate(newTimings.begin(), newTimings.end(), 0.0F, metricAccumulator) / newTimings.size(); result.median = findMedian(newTimings, metricGetter); - result.percentile = findPercentile(percentile, newTimings, metricGetter); + for (auto percentile : percentiles) + { + result.percentiles.emplace_back(findPercentile(percentile, newTimings, metricGetter)); + } result.coeffVar = findCoeffOfVariance(newTimings, metricGetter, result.mean); return result; } -void printEpilog(std::vector<InferenceTime> const& timings, float walltimeMs, float percentile, int32_t batchSize, - std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose) +void printEpilog(std::vector<InferenceTime> const& timings, float walltimeMs, std::vector<float> const& percentiles, + int32_t batchSize, int32_t infStreams, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose) { float const throughput = batchSize * timings.size() / walltimeMs * 1000; auto const getLatency = [](InferenceTime const& t) { return t.latency(); }; - auto const latencyResult = getPerformanceResult(timings, getLatency, percentile); - - auto const getEndToEnd = [](InferenceTime const& t) { return t.e2e; }; - auto const e2eLatencyResult = getPerformanceResult(timings, getEndToEnd, percentile); + auto const latencyResult = getPerformanceResult(timings, getLatency, percentiles); auto const getEnqueue = [](InferenceTime const& t) { return t.enq; }; - auto const enqueueResult = getPerformanceResult(timings, getEnqueue, percentile); + auto const enqueueResult = getPerformanceResult(timings, getEnqueue, percentiles); auto const getH2d = [](InferenceTime const& t) { return t.h2d; }; - auto const h2dResult = getPerformanceResult(timings, getH2d, percentile); + auto const h2dResult = getPerformanceResult(timings, getH2d, percentiles); auto const getCompute = [](InferenceTime const& t) { return t.compute; }; - auto const gpuComputeResult = getPerformanceResult(timings, getCompute, percentile); + auto const gpuComputeResult = getPerformanceResult(timings, getCompute, percentiles); auto const getD2h = [](InferenceTime const& t) { return t.d2h; }; - auto const d2hResult = getPerformanceResult(timings, getD2h, percentile); + auto const d2hResult = getPerformanceResult(timings, getD2h, percentiles); - auto const toPerfString = [percentile](const PerformanceResult& r) { + auto const toPerfString = [&](const PerformanceResult& r) { std::stringstream s; s << "min = " << r.min << " ms, max = " << r.max << " ms, mean = " << r.mean << " ms, " - << "median = " << r.median << " ms, percentile(" << percentile << "%) = " << r.percentile << " ms"; + << "median = " << r.median << " ms"; + for (int32_t i = 0, n = percentiles.size(); i < n; ++i) + { + s << ", percentile(" << percentiles[i] << "%) = " << r.percentiles[i] << " ms"; + } return s.str(); }; @@ -224,7 +256,6 @@ void printEpilog(std::vector<InferenceTime> const& timings, float 
walltimeMs, fl osInfo << "=== Performance summary ===" << std::endl; osInfo << "Throughput: " << throughput << " qps" << std::endl; osInfo << "Latency: " << toPerfString(latencyResult) << std::endl; - osInfo << "End-to-End Host Latency: " << toPerfString(e2eLatencyResult) << std::endl; osInfo << "Enqueue Time: " << toPerfString(enqueueResult) << std::endl; osInfo << "H2D Latency: " << toPerfString(h2dResult) << std::endl; osInfo << "GPU Compute Time: " << toPerfString(gpuComputeResult) << std::endl; @@ -268,6 +299,13 @@ void printEpilog(std::vector<InferenceTime> const& timings, float walltimeMs, fl << "stability." << std::endl; } + // Report warnings if multiple inference streams are used. + if (infStreams > 1) + { + osWarning << "* Multiple inference streams are used. Latencies may not be accurate since inferences may run in " + << "parallel. Please use \"Throughput\" as the performance metric instead." << std::endl; + } + // Explain what the metrics mean. osInfo << "Explanations of the performance metrics are printed in the verbose logs." << std::endl; printMetricExplanations(osVerbose); @@ -275,27 +313,28 @@ void printEpilog(std::vector<InferenceTime> const& timings, float walltimeMs, fl osInfo << std::endl; } -void printPerformanceReport(std::vector<InferenceTrace> const& trace, const ReportingOptions& reporting, float warmupMs, - int32_t batchSize, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose) +void printPerformanceReport(std::vector<InferenceTrace> const& trace, ReportingOptions const& reportingOpts, + InferenceOptions const& infOpts, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose) { + int32_t batchSize = infOpts.batch; + float const warmupMs = infOpts.warmup; auto const isNotWarmup = [&warmupMs](const InferenceTrace& a) { return a.computeStart >= warmupMs; }; auto const noWarmup = std::find_if(trace.begin(), trace.end(), isNotWarmup); int32_t const warmups = noWarmup - trace.begin(); float const benchTime = trace.back().d2hEnd - noWarmup->h2dStart; - // when implicit batch used, batchSize = options.inference.batch, which is parsed through --batch - // when explicit batch used, batchSize = options.inference.batch = 0 // treat inference with explicit batch as a single query and report the throughput batchSize = batchSize ? batchSize : 1; printProlog(warmups * batchSize, (trace.size() - warmups) * batchSize, warmupMs, benchTime, osInfo); std::vector<InferenceTime> timings(trace.size() - warmups); std::transform(noWarmup, trace.end(), timings.begin(), traceToTiming); - printTiming(timings, reporting.avgs, osInfo); - printEpilog(timings, benchTime, reporting.percentile, batchSize, osInfo, osWarning, osVerbose); + printTiming(timings, reportingOpts.avgs, osInfo); + printEpilog( + timings, benchTime, reportingOpts.percentiles, batchSize, infOpts.infStreams, osInfo, osWarning, osVerbose); - if (!reportingOpts.exportTimes.empty()) + if (!reportingOpts.exportTimes.empty()) { - exportJSONTrace(trace, reporting.exportTimes); + exportJSONTrace(trace, reportingOpts.exportTimes, warmups); } } @@ -303,15 +342,16 @@ void printPerformanceReport(std::vector<InferenceTrace> const& trace, const Repo //! [ value, ...] //! value ::= { "start enq" : time, "end enq" : time, "start h2d" : time, "end h2d" : time, "start compute" : time, //! "end compute" : time, "start d2h" : time, "end d2h" : time, "h2d" : time, "compute" : time, -//! "d2h" : time, "latency" : time, "end to end" : time } +//! "d2h" : time, "latency" : time } //! 
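//! For illustration, one emitted record could look like (values made up, but
//! consistent with the h2d/compute/d2h deltas):
//!   { "startH2dMs" : 10.2, "endH2dMs" : 10.9, "startComputeMs" : 10.9,
//!     "endComputeMs" : 15.0, "startD2hMs" : 15.0, "endD2hMs" : 15.4,
//!     "h2dMs" : 0.7, "computeMs" : 4.1, "d2hMs" : 0.4, "latencyMs" : 5.2 }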
-void exportJSONTrace(std::vector<InferenceTrace> const& trace, std::string const& fileName) +void exportJSONTrace(std::vector<InferenceTrace> const& trace, std::string const& fileName, int32_t const nbWarmups) { std::ofstream os(fileName, std::ofstream::trunc); os << "[" << std::endl; char const* sep = " "; - for (auto const& t : trace) + for (auto iter = trace.begin() + nbWarmups; iter < trace.end(); ++iter) { + auto const& t = *iter; InferenceTime const it(traceToTiming(t)); os << sep << "{ "; sep = ", "; @@ -321,8 +361,8 @@ void exportJSONTrace(std::vector<InferenceTrace> const& trace, std::string const << "\"startComputeMs\" : " << t.computeStart << sep << "\"endComputeMs\" : " << t.computeEnd << sep << "\"startD2hMs\" : " << t.d2hStart << sep << "\"endD2hMs\" : " << t.d2hEnd << sep << "\"h2dMs\" : " << it.h2d << sep << "\"computeMs\" : " << it.compute << sep - << "\"d2hMs\" : " << it.d2h << sep << "\"latencyMs\" : " << it.latency() << sep - << "\"endToEndMs\" : " << it.e2e << " }" << std::endl; + << "\"d2hMs\" : " << it.d2h << sep << "\"latencyMs\" : " << it.latency() << " }" + << std::endl; // clang-format on } os << "]" << std::endl; @@ -346,42 +386,49 @@ void Profiler::reportLayerTime(char const* layerName, float timeMs) noexcept } } - mIterator->timeMs += timeMs; + mIterator->timeMs.push_back(timeMs); ++mIterator; } void Profiler::print(std::ostream& os) const noexcept { - std::string const nameHdr("Layer"); - std::string const timeHdr(" Time (ms)"); - std::string const avgHdr(" Avg. Time (ms)"); - std::string const percentageHdr(" Time %"); + std::string const nameHdr(" Layer"); + std::string const timeHdr(" Time(ms)"); + std::string const avgHdr(" Avg.(ms)"); + std::string const medHdr(" Median(ms)"); + std::string const percentageHdr(" Time(%)"); float const totalTimeMs = getTotalTime(); - auto const cmpLayer = [](LayerProfile const& a, LayerProfile const& b) { return a.name.size() < b.name.size(); }; - auto const longestName = std::max_element(mLayers.begin(), mLayers.end(), cmpLayer); - auto const nameLength = std::max(longestName->name.size() + 1, nameHdr.size()); auto const timeLength = timeHdr.size(); auto const avgLength = avgHdr.size(); + auto const medLength = medHdr.size(); auto const percentageLength = percentageHdr.size(); os << std::endl << "=== Profile (" << mUpdatesCount << " iterations) ===" << std::endl - << std::setw(nameLength) << nameHdr << timeHdr << avgHdr << percentageHdr << std::endl; + << timeHdr << avgHdr << medHdr << percentageHdr << nameHdr << std::endl; for (auto const& p : mLayers) { + if (p.timeMs.empty() || getTotalTime(p) == 0.F) + { + // there is no point in printing profiling for a layer that didn't run at all + continue; + } // clang-format off - os << std::setw(nameLength) << p.name << std::setw(timeLength) << std::fixed << std::setprecision(2) << p.timeMs - << std::setw(avgLength) << std::fixed << std::setprecision(4) << p.timeMs / mUpdatesCount - << std::setw(percentageLength) << std::fixed << std::setprecision(1) << p.timeMs / totalTimeMs * 100 - << std::endl; + os << std::setw(timeLength) << std::fixed << std::setprecision(2) << getTotalTime(p) + << std::setw(avgLength) << std::fixed << std::setprecision(4) << getAvgTime(p) + << std::setw(medLength) << std::fixed << std::setprecision(4) << getMedianTime(p) + << std::setw(percentageLength) << std::fixed << std::setprecision(1) << getTotalTime(p) / totalTimeMs * 100 + << " " << p.name << std::endl; } { - os << std::setw(nameLength) << "Total" << std::setw(timeLength) << std::fixed << std::setprecision(2) + os << std::setw(timeLength) << 
std::fixed << std::setprecision(2) << totalTimeMs << std::setw(avgLength) << std::fixed << std::setprecision(4) << totalTimeMs / mUpdatesCount - << std::setw(percentageLength) << std::fixed << std::setprecision(1) << 100.0 << std::endl; + << std::setw(medLength) << std::fixed << std::setprecision(4) << getMedianTime() + << std::setw(percentageLength) << std::fixed << std::setprecision(1) << 100.0 + << " Total" << std::endl; // clang-format on } os << std::endl; @@ -397,10 +444,11 @@ void Profiler::exportJSONProfile(std::string const& fileName) const noexcept for (auto const& l : mLayers) { // clang-format off - os << ", {" << " \"name\" : \"" << l.name << "\"" - ", \"timeMs\" : " << l.timeMs - << ", \"averageMs\" : " << l.timeMs / mUpdatesCount - << ", \"percentage\" : " << l.timeMs / totalTimeMs * 100 + os << ", {" << R"( "name" : ")" << l.name << R"(")" + R"(, "timeMs" : )" << getTotalTime(l) + << R"(, "averageMs" : )" << getAvgTime(l) + << R"(, "medianMs" : )" << getMedianTime(l) + << R"(, "percentage" : )" << getTotalTime(l) / totalTimeMs * 100 << " }" << std::endl; // clang-format on } @@ -415,8 +463,13 @@ void dumpInputs(nvinfer1::IExecutionContext const& context, Bindings const& bind void dumpOutputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os) { - os << "Output Tensors:" << std::endl; - bindings.dumpOutputs(context, os); + auto isOutput = [](Binding const& b) { return !b.isInput; }; + bindings.dumpBindings(context, isOutput, os); +} + +void dumpRawBindingsToFiles(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os) +{ + bindings.dumpRawBindingToFiles(context, os); } void exportJSONOutput( @@ -429,10 +482,10 @@ void exportJSONOutput( for (auto const& binding : output) { // clang-format off - os << sep << "{ \"name\" : \"" << binding.first << "\"" << std::endl; + os << sep << R"({ "name" : ")" << binding.first << "\"" << std::endl; sep = ", "; - os << " " << sep << "\"dimensions\" : \""; - bindings.dumpBindingDimensions(binding.second, context, os); + os << " " << sep << R"("dimensions" : ")"; + bindings.dumpBindingDimensions(binding.first, context, os); os << "\"" << std::endl; os << " " << sep << "\"values\" : [ "; bindings.dumpBindingValues(context, binding.second, os, sep, batch); @@ -442,4 +495,115 @@ void exportJSONOutput( os << "]" << std::endl; } +void exportJSONOutput( + nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::string const& fileName, int32_t batch); + +void printLayerInfo( + ReportingOptions const& reporting, nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context) +{ + if (reporting.layerInfo) + { + sample::gLogInfo << "Layer Information:" << std::endl; + sample::gLogInfo << getLayerInformation(engine, context, nvinfer1::LayerInformationFormat::kONELINE) + << std::flush; + } + if (!reporting.exportLayerInfo.empty()) + { + std::ofstream os(reporting.exportLayerInfo, std::ofstream::trunc); + os << getLayerInformation(engine, context, nvinfer1::LayerInformationFormat::kJSON) << std::flush; + } +} + +void printOptimizationProfileInfo(ReportingOptions const& reporting, nvinfer1::ICudaEngine const* engine) +{ + if (reporting.optProfileInfo) + { + sample::gLogInfo << "Optimization Profile Information:" << std::endl; + for (int32_t i = 0; i < engine->getNbOptimizationProfiles(); i++) + { + for (int32_t j = 0, e = engine->getNbIOTensors(); j < e; j++) + { + auto const tensorName = engine->getIOTensorName(j); + + if (engine->getTensorIOMode(tensorName) == 
nvinfer1::TensorIOMode::kINPUT) + { + auto tensorMinShape = engine->getProfileShape(tensorName, i, nvinfer1::OptProfileSelector::kMIN); + auto tensorOptShape = engine->getProfileShape(tensorName, i, nvinfer1::OptProfileSelector::kOPT); + auto tensorMaxShape = engine->getProfileShape(tensorName, i, nvinfer1::OptProfileSelector::kMAX); + + sample::gLogInfo << "Model input " << tensorName << " (profile " << i << "): " + << "min=" << dimsToString(tensorMinShape) + << ", opt=" << dimsToString(tensorOptShape) + << ", max=" << dimsToString(tensorMaxShape) << std::endl; + } + } + } + } +} + +void printPerformanceProfile(ReportingOptions const& reporting, InferenceEnvironment& iEnv) +{ + if (reporting.profile) + { + iEnv.profiler->print(sample::gLogInfo); + } + if (!reporting.exportProfile.empty()) + { + iEnv.profiler->exportJSONProfile(reporting.exportProfile); + } + + // Print a warning about total per-layer latency when auxiliary streams are used. + if (!iEnv.safe && (reporting.profile || !reporting.exportProfile.empty())) + { + int32_t const nbAuxStreams = iEnv.engine.get()->getNbAuxStreams(); + if (nbAuxStreams > 0) + { + sample::gLogWarning << "The engine uses " << nbAuxStreams << " auxiliary streams, so the \"Total\" latency " + << "may not be accurate because some layers may have run in parallel!" << std::endl; + } + } +} + +namespace details +{ +void dump(std::unique_ptr<nvinfer1::IExecutionContext> const& context, std::unique_ptr<Bindings> const& binding, + ReportingOptions const& reporting, int32_t batch) +{ + if (!context) + { + sample::gLogError << "Empty context! Skip printing outputs." << std::endl; + return; + } + if (reporting.output) + { + dumpOutputs(*context, *binding, sample::gLogInfo); + } + if (reporting.dumpRawBindings) + { + dumpRawBindingsToFiles(*context, *binding, sample::gLogInfo); + } + if (!reporting.exportOutput.empty()) + { + exportJSONOutput(*context, *binding, reporting.exportOutput, batch); + } +} +} // namespace details + +void printOutput(ReportingOptions const& reporting, InferenceEnvironment const& iEnv, int32_t batch) +{ + auto const& binding = iEnv.bindings.at(0); + if (!binding) + { + sample::gLogError << "Empty bindings! Skip printing outputs." << std::endl; + return; + } + if (iEnv.safe) + { + sample::gLogError << "Safe inference is not supported!" << std::endl; + return; + } + auto const& context = iEnv.contexts.at(0); + details::dump(context, binding, reporting, batch); +} + } // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleReporting.h b/src/Detector/tensorrt_yolo/common/sampleReporting.h index 5f730987..922ef3c8 100644 --- a/src/Detector/tensorrt_yolo/common/sampleReporting.h +++ b/src/Detector/tensorrt_yolo/common/sampleReporting.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -19,27 +20,26 @@ #include #include - -#include "NvInfer.h" +#include #include "sampleOptions.h" -#include "sampleUtils.h" namespace sample { +class Bindings; + //! //! \struct InferenceTime //! 
\brief Measurement times in milliseconds //! struct InferenceTime { - InferenceTime(float q, float i, float c, float o, float e) + InferenceTime(float q, float i, float c, float o) : enq(q) , h2d(i) , compute(c) , d2h(o) - , e2e(e) { } @@ -54,7 +54,6 @@ float h2d{0}; // Host to Device float compute{0}; // Compute float d2h{0}; // Device to Host - float e2e{0}; // end to end // ideal latency float latency() const @@ -102,7 +101,7 @@ struct InferenceTrace inline InferenceTime operator+(InferenceTime const& a, InferenceTime const& b) { - return InferenceTime(a.enq + b.enq, a.h2d + b.h2d, a.compute + b.compute, a.d2h + b.d2h, a.e2e + b.e2e); + return InferenceTime(a.enq + b.enq, a.h2d + b.h2d, a.compute + b.compute, a.d2h + b.d2h); } inline InferenceTime operator+=(InferenceTime& a, InferenceTime const& b) @@ -116,12 +115,12 @@ inline InferenceTime operator+=(InferenceTime& a, InferenceTime const& b) //! struct PerformanceResult { - float min{0}; - float max{0}; - float mean{0}; - float median{0}; - float percentile{0}; - float coeffVar{0}; // coefficient of variation + float min{0.F}; + float max{0.F}; + float mean{0.F}; + float median{0.F}; + std::vector<float> percentiles; + float coeffVar{0.F}; // coefficient of variation }; //! @@ -137,14 +136,14 @@ void printTiming(std::vector<InferenceTime> const& timings, int32_t runsPerAvg, //! //! \brief Print the performance summary of a trace //! -void printEpilog(std::vector<InferenceTime> const& timings, float percentile, int32_t batchSize, std::ostream& osInfo, - std::ostream& osWarning, std::ostream& osVerbose); +void printEpilog(std::vector<InferenceTime> const& timings, std::vector<float> const& percentiles, int32_t batchSize, + std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose); //! //! \brief Get the result of a specific performance metric from a trace //! PerformanceResult getPerformanceResult(std::vector<InferenceTime> const& timings, - std::function<float(InferenceTime const&)> metricGetter, float percentile); + std::function<float(InferenceTime const&)> metricGetter, std::vector<float> const& percentiles); //! //! \brief Print the explanations of the performance metrics printed in printEpilog() function. //! void printMetricExplanations(std::ostream& os); //! //! \brief Print and summarize a timing trace //! -void printPerformanceReport(std::vector<InferenceTrace> const& trace, ReportingOptions const& reporting, float warmupMs, - int32_t batchSize, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose); +void printPerformanceReport(std::vector<InferenceTrace> const& trace, ReportingOptions const& reportingOpts, + InferenceOptions const& infOpts, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose); //! //! \brief Export a timing trace to JSON file //! -void exportJSONTrace(std::vector<InferenceTrace> const& trace, std::string const& fileName); +void exportJSONTrace( + std::vector<InferenceTrace> const& trace, std::string const& fileName, int32_t const nbWarmups); //! //! \brief Print input tensors to stream //! void dumpInputs(nvinfer1::IExecutionContext const& context, Bindings const& bind...) //! void dumpOutputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os); +void dumpRawBindingsToFiles(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os); + //! //! \brief Export output tensors to JSON file //! @@ -185,7 +187,7 @@ void exportJSONOutput( struct LayerProfile { std::string name; - float timeMs{0}; + std::vector<float> timeMs; }; //! 
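//! Worked example for the trimmed InferenceTime above (e2e was removed):
//! assuming latency() sums the remaining stages as the "ideal latency" comment
//! suggests, h2d = 0.7 ms, compute = 4.1 ms and d2h = 0.4 ms give
//!   latency() == 0.7 + 4.1 + 0.4 == 5.2 ms,
//! and operator+ accumulates the four fields element-wise for trace averages.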
@@ -208,8 +210,58 @@ class Profiler : public nvinfer1::IProfiler private: float getTotalTime() const noexcept { - auto const plusLayerTime = [](float accumulator, LayerProfile const& lp) { return accumulator + lp.timeMs; }; - return std::accumulate(mLayers.begin(), mLayers.end(), 0.0, plusLayerTime); + auto const plusLayerTime = [](float accumulator, LayerProfile const& lp) { + return accumulator + std::accumulate(lp.timeMs.begin(), lp.timeMs.end(), 0.F, std::plus()); + }; + return std::accumulate(mLayers.begin(), mLayers.end(), 0.0F, plusLayerTime); + } + + float getMedianTime() const noexcept + { + if (mLayers.empty()) + { + return 0.F; + } + std::vector totalTime; + for (size_t run = 0; run < mLayers[0].timeMs.size(); ++run) + { + auto const layerTime + = [&run](float accumulator, LayerProfile const& lp) { return accumulator + lp.timeMs[run]; }; + auto t = std::accumulate(mLayers.begin(), mLayers.end(), 0.F, layerTime); + totalTime.push_back(t); + } + return median(totalTime); + } + + float getMedianTime(LayerProfile const& p) const noexcept + { + return median(p.timeMs); + } + + static float median(std::vector vals) + { + if (vals.empty()) + { + return 0.F; + } + std::sort(vals.begin(), vals.end()); + if (vals.size() % 2U == 1U) + { + return vals[vals.size() / 2U]; + } + return (vals[vals.size() / 2U - 1U] + vals[vals.size() / 2U]) * 0.5F; + } + + //! return the total runtime of given layer profile + float getTotalTime(LayerProfile const& p) const noexcept + { + auto const& vals = p.timeMs; + return std::accumulate(vals.begin(), vals.end(), 0.F, std::plus()); + } + + float getAvgTime(LayerProfile const& p) const noexcept + { + return getTotalTime(p) / p.timeMs.size(); } std::vector mLayers; @@ -217,6 +269,30 @@ class Profiler : public nvinfer1::IProfiler int32_t mUpdatesCount{0}; }; +//! +//! \brief Print layer info to logger or export it to output JSON file. +//! +void printLayerInfo( + ReportingOptions const& reporting, nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context); + +//! +//! \brief Print optimization profile info to logger. +//! +void printOptimizationProfileInfo(ReportingOptions const& reporting, nvinfer1::ICudaEngine const* engine); + +//! Forward declaration. +struct InferenceEnvironment; + +//! +//! \brief Print per-layer perf profile data to logger or export it to output JSON file. +//! +void printPerformanceProfile(ReportingOptions const& reporting, InferenceEnvironment& iEnv); + +//! +//! \brief Print binding output values to logger or export them to output JSON file. +//! +void printOutput(ReportingOptions const& reporting, InferenceEnvironment const& iEnv, int32_t batch); + } // namespace sample #endif // TRT_SAMPLE_REPORTING_H diff --git a/src/Detector/tensorrt_yolo/common/sampleUtils.cpp b/src/Detector/tensorrt_yolo/common/sampleUtils.cpp new file mode 100644 index 00000000..689e5857 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/sampleUtils.cpp @@ -0,0 +1,587 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sampleUtils.h" +#include "bfloat16.h" +#include "half.h" + +using namespace nvinfer1; + +namespace sample +{ + +size_t dataTypeSize(nvinfer1::DataType dataType) +{ + switch (dataType) + { + case nvinfer1::DataType::kINT64: return 8U; + case nvinfer1::DataType::kINT32: + case nvinfer1::DataType::kFLOAT: return 4U; + case nvinfer1::DataType::kBF16: + case nvinfer1::DataType::kHALF: return 2U; + case nvinfer1::DataType::kBOOL: + case nvinfer1::DataType::kUINT8: + case nvinfer1::DataType::kINT8: + case nvinfer1::DataType::kFP8: return 1U; + case nvinfer1::DataType::kINT4: + ASSERT(false && "Element size is not implemented for sub-byte data-types."); + } + return 0; +} + +int64_t volume(nvinfer1::Dims const& dims, nvinfer1::Dims const& strides, int32_t vecDim, int32_t comps, int32_t batch) +{ + int64_t maxNbElems = 1; + for (int32_t i = 0; i < dims.nbDims; ++i) + { + // Get effective length of axis. + int64_t d = dims.d[i]; + // Any dimension is 0, it is an empty tensor. + if (d == 0) + { + return 0; + } + if (i == vecDim) + { + d = samplesCommon::divUp(d, comps); + } + maxNbElems = std::max(maxNbElems, d * strides.d[i]); + } + return maxNbElems * batch * (vecDim < 0 ? 1 : comps); +} + +nvinfer1::Dims toDims(std::vector const& vec) +{ + int32_t limit = static_cast(nvinfer1::Dims::MAX_DIMS); + if (static_cast(vec.size()) > limit) + { + sample::gLogWarning << "Vector too long, only first 8 elements are used in dimension." << std::endl; + } + // Pick first nvinfer1::Dims::MAX_DIMS elements + nvinfer1::Dims dims{std::min(static_cast(vec.size()), limit), {}}; + std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d)); + return dims; +} + +void loadFromFile(std::string const& fileName, char* dst, size_t size) +{ + ASSERT(dst); + + std::ifstream file(fileName, std::ios::in | std::ios::binary); + if (file.is_open()) + { + file.seekg(0, std::ios::end); + int64_t fileSize = static_cast(file.tellg()); + // Due to change from int32_t to int64_t VC engines created with earlier versions + // may expect input of the half of the size + if (fileSize != static_cast(size) && fileSize != static_cast(size * 2)) + { + std::ostringstream msg; + msg << "Unexpected file size for input file: " << fileName << ". Note: Input binding size is: " << size + << " bytes but the file size is " << fileSize + << " bytes. Double check the size and datatype of the provided data."; + throw std::invalid_argument(msg.str()); + } + // Move file pointer back to the beginning after reading file size. + file.seekg(0, std::ios::beg); + file.read(dst, size); + size_t const nbBytesRead = file.gcount(); + file.close(); + if (nbBytesRead != size) + { + std::ostringstream msg; + msg << "Unexpected file size for input file: " << fileName << ". 
Note: Expected: " << size + << " bytes but only read: " << nbBytesRead << " bytes"; + throw std::invalid_argument(msg.str()); + } + } + else + { + std::ostringstream msg; + msg << "Cannot open file " << fileName << "!"; + throw std::invalid_argument(msg.str()); + } +} + +std::vector splitToStringVec(std::string const& s, char separator, int64_t maxSplit) +{ + std::vector splitted; + + for (size_t start = 0; start < s.length();) + { + // If maxSplit is specified and we have reached maxSplit, emplace back the rest of the string and break the + // loop. + if (maxSplit >= 0 && static_cast(splitted.size()) == maxSplit) + { + splitted.emplace_back(s.substr(start, s.length() - start)); + break; + } + + size_t separatorIndex = s.find(separator, start); + if (separatorIndex == std::string::npos) + { + separatorIndex = s.length(); + } + splitted.emplace_back(s.substr(start, separatorIndex - start)); + + // If the separator is the last character, then we should push an empty string at the end. + if (separatorIndex == s.length() - 1) + { + splitted.emplace_back(""); + } + + start = separatorIndex + 1; + } + + return splitted; +} + +bool broadcastIOFormats(std::vector const& formats, size_t nbBindings, bool isInput /*= true*/) +{ + bool broadcast = formats.size() == 1; + bool validFormatsCount = broadcast || (formats.size() == nbBindings); + if (!formats.empty() && !validFormatsCount) + { + if (isInput) + { + throw std::invalid_argument( + "The number of inputIOFormats must match network's inputs or be one for broadcasting."); + } + + throw std::invalid_argument( + "The number of outputIOFormats must match network's outputs or be one for broadcasting."); + } + return broadcast; +} + +void sparsifyMatMulKernelWeights(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights) +{ + using TensorToLayer = std::unordered_map; + using LayerToTensor = std::unordered_map; + + // 1. Collect layers and tensors information from the network. + TensorToLayer matmulI2L; + TensorToLayer constO2L; + TensorToLayer shuffleI2L; + LayerToTensor shuffleL2O; + auto collectMappingInfo = [&](int32_t const idx) + { + ILayer* l = network.getLayer(idx); + switch (l->getType()) + { + case nvinfer1::LayerType::kMATRIX_MULTIPLY: + { + // assume weights on the second input. + matmulI2L.insert({l->getInput(1), l}); + break; + } + case nvinfer1::LayerType::kCONSTANT: + { + DataType const dtype = static_cast(l)->getWeights().type; + if (dtype == nvinfer1::DataType::kFLOAT || dtype == nvinfer1::DataType::kHALF) + { + // Sparsify float only. + constO2L.insert({l->getOutput(0), l}); + } + break; + } + case nvinfer1::LayerType::kSHUFFLE: + { + shuffleI2L.insert({l->getInput(0), l}); + shuffleL2O.insert({l, l->getOutput(0)}); + break; + } + default: break; + } + }; + int32_t const nbLayers = network.getNbLayers(); + for (int32_t i = 0; i < nbLayers; ++i) + { + collectMappingInfo(i); + } + if (matmulI2L.size() == 0 || constO2L.size() == 0) + { + // No MatrixMultiply or Constant layer found, no weights to sparsify. 
+ return; + } + + // Helper for analysis + auto isTranspose + = [](nvinfer1::Permutation const& perm) -> bool { return (perm.order[0] == 1 && perm.order[1] == 0); }; + auto is2D = [](nvinfer1::Dims const& dims) -> bool { return dims.nbDims == 2; }; + auto isIdenticalReshape = [](nvinfer1::Dims const& dims) -> bool + { + for (int32_t i = 0; i < dims.nbDims; ++i) + { + if (dims.d[i] != i || dims.d[i] != -1) + { + return false; + } + } + return true; + }; + auto tensorReachedViaTranspose = [&](nvinfer1::ITensor* t, bool& needTranspose) -> ITensor* + { + while (shuffleI2L.find(t) != shuffleI2L.end()) + { + nvinfer1::IShuffleLayer* s = static_cast(shuffleI2L.at(t)); + if (!is2D(s->getInput(0)->getDimensions()) || !is2D(s->getReshapeDimensions()) + || !isIdenticalReshape(s->getReshapeDimensions())) + { + break; + } + + if (isTranspose(s->getFirstTranspose())) + { + needTranspose = !needTranspose; + } + if (isTranspose(s->getSecondTranspose())) + { + needTranspose = !needTranspose; + } + + t = shuffleL2O.at(s); + } + return t; + }; + + // 2. Forward analysis to collect the Constant layers connected to MatMul via Transpose + std::unordered_map constantLayerToSparse; + for (auto& o2l : constO2L) + { + // If need to transpose the weights of the Constant layer. + // Need to transpose by default due to semantic difference. + bool needTranspose{true}; + ITensor* t = tensorReachedViaTranspose(o2l.first, needTranspose); + if (matmulI2L.find(t) == matmulI2L.end()) + { + continue; + } + + // check MatMul params... + IMatrixMultiplyLayer* mm = static_cast(matmulI2L.at(t)); + bool const twoInputs = mm->getNbInputs() == 2; + bool const all2D = is2D(mm->getInput(0)->getDimensions()) && is2D(mm->getInput(1)->getDimensions()); + bool const isSimple = mm->getOperation(0) == nvinfer1::MatrixOperation::kNONE + && mm->getOperation(1) != nvinfer1::MatrixOperation::kVECTOR; + if (!(twoInputs && all2D && isSimple)) + { + continue; + } + if (mm->getOperation(1) == nvinfer1::MatrixOperation::kTRANSPOSE) + { + needTranspose = !needTranspose; + } + + constantLayerToSparse.insert({static_cast(o2l.second), needTranspose}); + } + + // 3. Finally, sparsify the weights + auto sparsifyConstantWeights = [&sparseWeights](nvinfer1::IConstantLayer* layer, bool const needTranspose) + { + Dims dims = layer->getOutput(0)->getDimensions(); + ASSERT(dims.nbDims == 2); + int32_t const idxN = needTranspose ? 1 : 0; + int32_t const n = dims.d[idxN]; + int32_t const k = dims.d[1 - idxN]; + sparseWeights.emplace_back(); + std::vector& spw = sparseWeights.back(); + Weights w = layer->getWeights(); + DataType const dtype = w.type; + ASSERT(dtype == nvinfer1::DataType::kFLOAT + || dtype == nvinfer1::DataType::kHALF); // non-float weights should have been ignored. 
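+        // The 2:4 pruning below runs along the channel (K) dimension: weights stored
+        // transposed are first rewritten as [N, K], pruned, then transposed back.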
+ + if (needTranspose) + { + if (dtype == nvinfer1::DataType::kFLOAT) + { + spw.resize(w.count * sizeof(float)); + transpose2DWeights(spw.data(), w.values, k, n); + } + else if (dtype == nvinfer1::DataType::kHALF) + { + spw.resize(w.count * sizeof(half_float::half)); + transpose2DWeights(spw.data(), w.values, k, n); + } + + w.values = spw.data(); + std::vector tmpW; + sparsify(w, n, 1, tmpW); + + if (dtype == nvinfer1::DataType::kFLOAT) + { + transpose2DWeights(spw.data(), tmpW.data(), n, k); + } + else if (dtype == nvinfer1::DataType::kHALF) + { + transpose2DWeights(spw.data(), tmpW.data(), n, k); + } + } + else + { + sparsify(w, n, 1, spw); + } + + w.values = spw.data(); + layer->setWeights(w); + }; + for (auto& l : constantLayerToSparse) + { + sparsifyConstantWeights(l.first, l.second); + } +} + +template +void setSparseWeights(L& l, int32_t k, int32_t trs, std::vector& sparseWeights) +{ + auto weights = l.getKernelWeights(); + sparsify(weights, k, trs, sparseWeights); + weights.values = sparseWeights.data(); + l.setKernelWeights(weights); +} + +// Explicit instantiation +template void setSparseWeights( + IConvolutionLayer& l, int32_t k, int32_t trs, std::vector& sparseWeights); + +void sparsify(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights) +{ + for (int32_t l = 0; l < network.getNbLayers(); ++l) + { + auto* layer = network.getLayer(l); + auto const t = layer->getType(); + if (t == nvinfer1::LayerType::kCONVOLUTION) + { + auto& conv = *static_cast(layer); + auto const& dims = conv.getKernelSizeNd(); + ASSERT(dims.nbDims == 2 || dims.nbDims == 3); + auto const k = conv.getNbOutputMaps(); + auto const trs = std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies()); + sparseWeights.emplace_back(); + setSparseWeights(conv, k, trs, sparseWeights.back()); + } + } + + sparsifyMatMulKernelWeights(network, sparseWeights); + sample::gLogVerbose << "--sparsity=force pruned " << sparseWeights.size() << " weights to be sparsity pattern." << std::endl; + sample::gLogVerbose << "--sparsity=force has been deprecated. 
Please use to rewrite the weights to a sparsity pattern and then run with --sparsity=enable" << std::endl; +} + +void sparsify(Weights const& weights, int32_t k, int32_t trs, std::vector& sparseWeights) +{ + switch (weights.type) + { + case DataType::kFLOAT: + sparsify(static_cast(weights.values), weights.count, k, trs, sparseWeights); + break; + case DataType::kHALF: + sparsify(static_cast(weights.values), weights.count, k, trs, sparseWeights); + break; + case DataType::kBF16: + sparsify(static_cast(weights.values), weights.count, k, trs, sparseWeights); + break; + case DataType::kINT8: + case DataType::kINT32: + case DataType::kUINT8: + case DataType::kBOOL: + case DataType::kINT4: + case DataType::kFP8: + case DataType::kINT64: + ASSERT(false && "Unsupported data type"); + } +} + +template +void print(std::ostream& os, T v) +{ + os << v; +} + +void print(std::ostream& os, int8_t v) +{ + os << static_cast(v); +} + +void print(std::ostream& os, __half v) +{ + os << static_cast(v); +} + +template +void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv) +{ + auto const vol = volume(dims); + T const* typedBuffer = static_cast(buffer); + std::string sep; + for (int64_t v = 0; v < vol; ++v) + { + int64_t curV = v; + int32_t dataOffset = 0; + for (int32_t dimIndex = dims.nbDims - 1; dimIndex >= 0; --dimIndex) + { + int32_t dimVal = curV % dims.d[dimIndex]; + if (dimIndex == vectorDim) + { + dataOffset += (dimVal / spv) * strides.d[dimIndex] * spv + dimVal % spv; + } + else + { + dataOffset += dimVal * strides.d[dimIndex] * (vectorDim == -1 ? 1 : spv); + } + curV /= dims.d[dimIndex]; + ASSERT(curV >= 0); + } + + os << sep; + sep = separator; + print(os, typedBuffer[dataOffset]); + } +} + +// Explicit instantiation +template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); +template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); +template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); +template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); +template void dumpBuffer<__half>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); +template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); +template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); +template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); + +template +void sparsify(T const* values, int64_t count, int32_t k, int32_t trs, std::vector& sparseWeights) +{ + auto const c = count / (k * trs); + sparseWeights.resize(count * sizeof(T)); + auto* sparseValues = reinterpret_cast(sparseWeights.data()); + + constexpr int32_t window = 4; + constexpr int32_t nonzeros = 2; + + int32_t const crs = c * trs; + auto 
const getIndex = [=](int32_t ki, int32_t ci, int32_t rsi) { return ki * crs + ci * trs + rsi; }; + + for (int64_t ki = 0; ki < k; ++ki) + { + for (int64_t rsi = 0; rsi < trs; ++rsi) + { + int32_t w = 0; + int32_t nz = 0; + for (int64_t ci = 0; ci < c; ++ci) + { + auto const index = getIndex(ki, ci, rsi); + if (nz < nonzeros) + { + sparseValues[index] = values[index]; + ++nz; + } + else + { + sparseValues[index] = 0; + } + if (++w == window) + { + w = 0; + nz = 0; + } + } + } + } +} + +// Explicit instantiation +template void sparsify( + float const* values, int64_t count, int32_t k, int32_t trs, std::vector& sparseWeights); +template void sparsify( + half_float::half const* values, int64_t count, int32_t k, int32_t trs, std::vector& sparseWeights); + +template +void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n) +{ + ASSERT(dst != src); + T* tdst = reinterpret_cast(dst); + T const* tsrc = reinterpret_cast(src); + for (int32_t mi = 0; mi < m; ++mi) + { + for (int32_t ni = 0; ni < n; ++ni) + { + int32_t const isrc = mi * n + ni; + int32_t const idst = ni * m + mi; + tdst[idst] = tsrc[isrc]; + } + } +} + +// Explicit instantiation +template void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n); +template void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n); + +template ::value, bool>::type> +void fillBuffer(void* buffer, int64_t volume, T min, T max) +{ + T* typedBuffer = static_cast(buffer); + std::default_random_engine engine; + std::uniform_int_distribution distribution(min, max); + auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; + std::generate(typedBuffer, typedBuffer + volume, generator); +} + +template ::value, int32_t>::type> +void fillBuffer(void* buffer, int64_t volume, T min, T max) +{ + T* typedBuffer = static_cast(buffer); + std::default_random_engine engine; + std::uniform_real_distribution distribution(min, max); + auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; + std::generate(typedBuffer, typedBuffer + volume, generator); +} + +// Explicit instantiation +template void fillBuffer(void* buffer, int64_t volume, bool min, bool max); +template void fillBuffer(void* buffer, int64_t volume, float min, float max); +template void fillBuffer(void* buffer, int64_t volume, int32_t min, int32_t max); +template void fillBuffer(void* buffer, int64_t volume, int64_t min, int64_t max); +template void fillBuffer(void* buffer, int64_t volume, int8_t min, int8_t max); +template void fillBuffer<__half>(void* buffer, int64_t volume, __half min, __half max); +template void fillBuffer(void* buffer, int64_t volume, BFloat16 min, BFloat16 max); +template void fillBuffer(void* buffer, int64_t volume, uint8_t min, uint8_t max); + +bool matchStringWithOneWildcard(std::string const& pattern, std::string const& target) +{ + auto const splitPattern = splitToStringVec(pattern, '*', 1); + + // If there is no wildcard, return if the two strings match exactly. + if (splitPattern.size() == 1) + { + return pattern == target; + } + + // Otherwise, target must follow prefix+anything+postfix pattern. 
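+    // For example, the pattern "conv*_weights" matches "conv1_weights" and "conv_block2_weights".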
+ return target.size() >= (splitPattern[0].size() + splitPattern[1].size()) && target.find(splitPattern[0]) == 0 + && target.rfind(splitPattern[1]) == (target.size() - splitPattern[1].size()); +} + +} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleUtils.h b/src/Detector/tensorrt_yolo/common/sampleUtils.h index 1509a7fc..6cd4280b 100644 --- a/src/Detector/tensorrt_yolo/common/sampleUtils.h +++ b/src/Detector/tensorrt_yolo/common/sampleUtils.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -22,6 +23,7 @@ #include #include #include +#include #include #include @@ -32,24 +34,20 @@ #include "common.h" #include "logger.h" -#include "sampleDevice.h" -#include "sampleOptions.h" + +#define SMP_RETVAL_IF_FALSE(condition, msg, retval, err) \ + { \ + if ((condition) == false) \ + { \ + (err) << (msg) << std::endl; \ + return retval; \ + } \ + } namespace sample { -inline int dataTypeSize(nvinfer1::DataType dataType) -{ - switch (dataType) - { - case nvinfer1::DataType::kINT32: - case nvinfer1::DataType::kFLOAT: return 4; - case nvinfer1::DataType::kHALF: return 2; - case nvinfer1::DataType::kBOOL: - case nvinfer1::DataType::kINT8: return 1; - } - return 0; -} +size_t dataTypeSize(nvinfer1::DataType dataType); template inline T roundUp(T m, T n) @@ -57,485 +55,71 @@ inline T roundUp(T m, T n) return ((m + n - 1) / n) * n; } -inline int volume(const nvinfer1::Dims& d) -{ - return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); -} - //! comps is the number of components in a vector. Ignored if vecDim < 0. -inline int64_t volume(const nvinfer1::Dims& dims, const nvinfer1::Dims& strides, int vecDim, int comps, int batch) -{ - int maxNbElems = 1; - for (int i = 0; i < dims.nbDims; ++i) - { - // Get effective length of axis. - int d = dims.d[i]; - // Any dimension is 0, it is an empty tensor. - if (d == 0) - { - return 0; - } - if (i == vecDim) - { - d = samplesCommon::divUp(d, comps); - } - maxNbElems = std::max(maxNbElems, d * strides.d[i]); - } - return static_cast(maxNbElems) * batch * (vecDim < 0 ? 1 : comps); -} +int64_t volume(nvinfer1::Dims const& dims, nvinfer1::Dims const& strides, int32_t vecDim, int32_t comps, int32_t batch); -inline int64_t volume(nvinfer1::Dims dims, int vecDim, int comps, int batch) -{ - if (vecDim != -1) - { - dims.d[vecDim] = roundUp(dims.d[vecDim], comps); - } - return volume(dims) * std::max(batch, 1); -} +using samplesCommon::volume; -inline nvinfer1::Dims toDims(const std::vector& vec) -{ - int limit = static_cast(nvinfer1::Dims::MAX_DIMS); - if (static_cast(vec.size()) > limit) - { - sample::gLogWarning << "Vector too long, only first 8 elements are used in dimension." 
<< std::endl; - } - // Pick first nvinfer1::Dims::MAX_DIMS elements - nvinfer1::Dims dims{std::min(static_cast(vec.size()), limit), {}}; - std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d)); - return dims; -} +nvinfer1::Dims toDims(std::vector const& vec); -template -inline void fillBuffer(void* buffer, int64_t volume, T min, T max) -{ - T* typedBuffer = static_cast(buffer); - std::default_random_engine engine; - if (std::is_integral::value) - { - std::uniform_int_distribution distribution(min, max); - auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; - std::generate(typedBuffer, typedBuffer + volume, generator); - } - else - { - std::uniform_real_distribution distribution(min, max); - auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; - std::generate(typedBuffer, typedBuffer + volume, generator); - } -} +template ::value, bool>::type = true> +void fillBuffer(void* buffer, int64_t volume, T min, T max); -// Specialization needed for custom type __half -template -inline void fillBufferHalf(void* buffer, int64_t volume, H min, H max) -{ - H* typedBuffer = static_cast(buffer); - std::default_random_engine engine; - std::uniform_real_distribution distribution(min, max); - auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; - std::generate(typedBuffer, typedBuffer + volume, generator); -} -template <> -inline void fillBuffer<__half>(void* buffer, int64_t volume, __half min, __half max) -{ - fillBufferHalf(buffer, volume, min, max); -} +template ::value, int32_t>::type = 0> +void fillBuffer(void* buffer, int64_t volume, T min, T max); template -inline void dumpBuffer(const void* buffer, const std::string& separator, std::ostream& os, const nvinfer1::Dims& dims, - const nvinfer1::Dims& strides, int32_t vectorDim, int32_t spv) -{ - const int64_t volume = std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies()); - const T* typedBuffer = static_cast(buffer); - std::string sep; - for (int64_t v = 0; v < volume; ++v) - { - int64_t curV = v; - int32_t dataOffset = 0; - for (int32_t dimIndex = dims.nbDims - 1; dimIndex >= 0; --dimIndex) - { - int32_t dimVal = curV % dims.d[dimIndex]; - if (dimIndex == vectorDim) - { - dataOffset += (dimVal / spv) * strides.d[dimIndex] * spv + dimVal % spv; - } - else - { - dataOffset += dimVal * strides.d[dimIndex] * (vectorDim == -1 ? 
1 : spv); - } - curV /= dims.d[dimIndex]; - ASSERT(curV >= 0); - } - - os << sep << typedBuffer[dataOffset]; - sep = separator; - } -} - -inline void loadFromFile(std::string const& fileName, char* dst, size_t size) -{ - ASSERT(dst); - - std::ifstream file(fileName, std::ios::in | std::ios::binary); - if (file.is_open()) - { - file.read(dst, size); - file.close(); - } - else - { - std::stringstream msg; - msg << "Cannot open file " << fileName << "!"; - throw std::invalid_argument(msg.str()); - } -} - -struct Binding -{ - bool isInput{false}; - std::unique_ptr buffer; - int64_t volume{0}; - nvinfer1::DataType dataType{nvinfer1::DataType::kFLOAT}; - - void fill(const std::string& fileName) - { - loadFromFile(fileName, static_cast(buffer->getHostBuffer()), buffer->getSize()); - } - - void fill() - { - switch (dataType) - { - case nvinfer1::DataType::kBOOL: - { - fillBuffer(buffer->getHostBuffer(), volume, 0, 1); - break; - } - case nvinfer1::DataType::kINT32: - { - fillBuffer(buffer->getHostBuffer(), volume, -128, 127); - break; - } - case nvinfer1::DataType::kINT8: - { - fillBuffer(buffer->getHostBuffer(), volume, -128, 127); - break; - } - case nvinfer1::DataType::kFLOAT: - { - fillBuffer(buffer->getHostBuffer(), volume, -1.0F, 1.0F); - break; - } - case nvinfer1::DataType::kHALF: - { - fillBuffer<__half>(buffer->getHostBuffer(), volume, -1.0F, 1.0F); - break; - } - } - } - - void dump(std::ostream& os, nvinfer1::Dims dims, nvinfer1::Dims strides, int32_t vectorDim, int32_t spv, - const std::string separator = " ") const - { - switch (dataType) - { - case nvinfer1::DataType::kBOOL: - { - dumpBuffer(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); - break; - } - case nvinfer1::DataType::kINT32: - { - dumpBuffer(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); - break; - } - case nvinfer1::DataType::kINT8: - { - dumpBuffer(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); - break; - } - case nvinfer1::DataType::kFLOAT: - { - dumpBuffer(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); - break; - } - case nvinfer1::DataType::kHALF: - { - dumpBuffer<__half>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); - break; - } - } - } -}; - -class Bindings -{ -public: - Bindings() = delete; - explicit Bindings(bool useManaged) - : mUseManaged(useManaged) - { - } - - void addBinding(int b, const std::string& name, bool isInput, int64_t volume, nvinfer1::DataType dataType, - const std::string& fileName = "") - { - while (mBindings.size() <= static_cast(b)) - { - mBindings.emplace_back(); - mDevicePointers.emplace_back(); - } - mNames[name] = b; - if (mBindings[b].buffer == nullptr) - { - if (mUseManaged) - mBindings[b].buffer.reset(new UnifiedMirroredBuffer); - else - mBindings[b].buffer.reset(new DiscreteMirroredBuffer); - } - mBindings[b].isInput = isInput; - // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr - // even for empty tensors, so allocate a dummy byte. 
- if (volume == 0) - mBindings[b].buffer->allocate(1); - else - mBindings[b].buffer->allocate(static_cast(volume) * static_cast(dataTypeSize(dataType))); - - mBindings[b].volume = volume; - mBindings[b].dataType = dataType; - mDevicePointers[b] = mBindings[b].buffer->getDeviceBuffer(); - if (isInput) - { - if (fileName.empty()) - fill(b); - else - fill(b, fileName); - } - } - - void** getDeviceBuffers() - { - return mDevicePointers.data(); - } - - void transferInputToDevice(TrtCudaStream& stream) - { - for (auto& b : mNames) - { - if (mBindings[b.second].isInput) - mBindings[b.second].buffer->hostToDevice(stream); - } - } - - void transferOutputToHost(TrtCudaStream& stream) - { - for (auto& b : mNames) - { - if (!mBindings[b.second].isInput) - mBindings[b.second].buffer->deviceToHost(stream); - } - } - - void fill(int binding, const std::string& fileName) - { - mBindings[binding].fill(fileName); - } - - void fill(int binding) - { - mBindings[binding].fill(); - } +void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, nvinfer1::Dims const& dims, + nvinfer1::Dims const& strides, int32_t vectorDim, int32_t spv); - void dumpBindingDimensions(int binding, const nvinfer1::IExecutionContext& context, std::ostream& os) const - { - const auto dims = context.getBindingDimensions(binding); - // Do not add a newline terminator, because the caller may be outputting a JSON string. - os << dims; - } - - void dumpBindingValues(const nvinfer1::IExecutionContext& context, int binding, std::ostream& os, - const std::string& separator = " ", int32_t batch = 1) const - { - nvinfer1::Dims dims = context.getBindingDimensions(binding); - nvinfer1::Dims strides = context.getStrides(binding); - int32_t vectorDim = context.getEngine().getBindingVectorizedDim(binding); - const int32_t spv = context.getEngine().getBindingComponentsPerElement(binding); +void loadFromFile(std::string const& fileName, char* dst, size_t size); - if (context.getEngine().hasImplicitBatchDimension()) - { - auto insertN = [](nvinfer1::Dims& d, int32_t bs) { - const int32_t nbDims = d.nbDims; - ASSERT(nbDims < nvinfer1::Dims::MAX_DIMS); - std::copy_backward(&d.d[0], &d.d[nbDims], &d.d[nbDims + 1]); - d.d[0] = bs; - d.nbDims = nbDims + 1; - }; - int32_t batchStride = 0; - for (int32_t i = 0; i < strides.nbDims; ++i) - { - if (strides.d[i] * dims.d[i] > batchStride) - { - batchStride = strides.d[i] * dims.d[i]; - } - } - insertN(dims, batch); - insertN(strides, batchStride); - vectorDim = (vectorDim == -1) ? 
-1 : vectorDim + 1; - } - - mBindings[binding].dump(os, dims, strides, vectorDim, spv, separator); - } - - void dumpInputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const - { - auto isInput = [](const Binding& b) { return b.isInput; }; - dumpBindings(context, isInput, os); - } +std::vector splitToStringVec(std::string const& option, char separator, int64_t maxSplit = -1); - void dumpOutputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const - { - auto isOutput = [](const Binding& b) { return !b.isInput; }; - dumpBindings(context, isOutput, os); - } +bool broadcastIOFormats(std::vector const& formats, size_t nbBindings, bool isInput = true); - void dumpBindings(const nvinfer1::IExecutionContext& context, std::ostream& os) const - { - auto all = [](const Binding& /*b*/) { return true; }; - dumpBindings(context, all, os); - } +int32_t getCudaDriverVersion(); - void dumpBindings( - const nvinfer1::IExecutionContext& context, bool (*predicate)(const Binding& b), std::ostream& os) const - { - for (const auto& n : mNames) - { - const auto binding = n.second; - if (predicate(mBindings[binding])) - { - os << n.first << ": ("; - dumpBindingDimensions(binding, context, os); - os << ")" << std::endl; +int32_t getCudaRuntimeVersion(); - dumpBindingValues(context, binding, os); - os << std::endl; - } - } - } +void sparsify(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights); +void sparsify(nvinfer1::Weights const& weights, int32_t k, int32_t rs, std::vector& sparseWeights); - std::unordered_map getInputBindings() const - { - auto isInput = [](const Binding& b) { return b.isInput; }; - return getBindings(isInput); - } - - std::unordered_map getOutputBindings() const - { - auto isOutput = [](const Binding& b) { return !b.isInput; }; - return getBindings(isOutput); - } - - std::unordered_map getBindings() const - { - auto all = [](const Binding& /*b*/) { return true; }; - return getBindings(all); - } +// Walk the weights elements and overwrite (at most) 2 out of 4 elements to 0. +template +void sparsify(T const* values, int64_t count, int32_t k, int32_t rs, std::vector& sparseWeights); - std::unordered_map getBindings(bool (*predicate)(const Binding& b)) const - { - std::unordered_map bindings; - for (const auto& n : mNames) - { - const auto binding = n.second; - if (predicate(mBindings[binding])) - bindings.insert(n); - } - return bindings; - } +template +void setSparseWeights(L& l, int32_t k, int32_t rs, std::vector& sparseWeights); -private: - std::unordered_map mNames; - std::vector mBindings; - std::vector mDevicePointers; - bool mUseManaged{false}; -}; +// Sparsify the weights of Constant layers that are fed to MatMul via Shuffle layers. +// Forward analysis on the API graph to determine which weights to sparsify. +void sparsifyMatMulKernelWeights( + nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights); template -struct TrtDestroyer -{ - void operator()(T* t) - { - //t->destroy(); - delete t; - } -}; +void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n); -template -using TrtUniquePtr = std::unique_ptr>; +//! A helper function to match a target string with a pattern where the pattern can contain up to one wildcard ('*') +//! character that matches to any strings. 
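+//! For example, "*bias" matches any name ending in "bias", and "layer1*" matches any name
+//! starting with "layer1".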
+bool matchStringWithOneWildcard(std::string const& pattern, std::string const& target); -inline bool broadcastIOFormats(const std::vector& formats, size_t nbBindings, bool isInput = true) -{ - bool broadcast = formats.size() == 1; - bool validFormatsCount = broadcast || (formats.size() == nbBindings); - if (!formats.empty() && !validFormatsCount) - { - if (isInput) - { - throw std::invalid_argument( - "The number of inputIOFormats must match network's inputs or be one for broadcasting."); - } - else - { - throw std::invalid_argument( - "The number of outputIOFormats must match network's outputs or be one for broadcasting."); - } - } - return broadcast; -} - -inline std::vector loadTimingCacheFile(const std::string inFileName) -{ - std::ifstream iFile(inFileName, std::ios::in | std::ios::binary); - if (!iFile) - { - sample::gLogWarning << "Could not read timing cache from: " << inFileName - << ". A new timing cache will be generated and written." << std::endl; - return std::vector(); - } - iFile.seekg(0, std::ifstream::end); - size_t fsize = iFile.tellg(); - iFile.seekg(0, std::ifstream::beg); - std::vector content(fsize); - iFile.read(content.data(), fsize); - iFile.close(); - sample::gLogInfo << "Loaded " << fsize << " bytes of timing cache from " << inFileName << std::endl; - return content; -} - -inline void saveTimingCacheFile(const std::string outFileName, const nvinfer1::IHostMemory* blob) +//! A helper method to find an item from an unordered_map. If the exact match exists, this is identical to +//! map.find(target). If the exact match does not exist, it returns the first plausible match, taking up to one wildcard +//! into account. If there is no plausible match, then it returns map.end(). +template +typename std::unordered_map::const_iterator findPlausible( + std::unordered_map const& map, std::string const& target) { - std::ofstream oFile(outFileName, std::ios::out | std::ios::binary); - if (!oFile) + auto res = map.find(target); + if (res == map.end()) { - sample::gLogWarning << "Could not write timing cache to: " << outFileName << std::endl; - return; + res = std::find_if( + map.begin(), map.end(), [&](typename std::unordered_map::value_type const& item) { + return matchStringWithOneWildcard(item.first, target); + }); } - oFile.write((char*) blob->data(), blob->size()); - oFile.close(); - sample::gLogInfo << "Saved " << blob->size() << " bytes of timing cache to " << outFileName << std::endl; -} - -inline int32_t getCudaDriverVersion() -{ - int32_t version{-1}; - cudaCheck(cudaDriverGetVersion(&version)); - return version; -} - -inline int32_t getCudaRuntimeVersion() -{ - int32_t version{-1}; - cudaCheck(cudaRuntimeGetVersion(&version)); - return version; + return res; } } // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/streamReader.h b/src/Detector/tensorrt_yolo/common/streamReader.h new file mode 100644 index 00000000..7d4aa1c6 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/streamReader.h @@ -0,0 +1,78 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef STREAM_READER_H +#define STREAM_READER_H + +#include "NvInferRuntime.h" +#include "sampleUtils.h" +#include + +namespace samplesCommon +{ + +//! Implements the TensorRT IStreamReader to allow deserializing an engine directly from the plan file. +class FileStreamReader final : public nvinfer1::IStreamReader +{ +public: + bool open(std::string filepath) + { + mFile.open(filepath, std::ios::binary); + return mFile.is_open(); + } + + void close() + { + if (mFile.is_open()) + { + mFile.close(); + } + } + + ~FileStreamReader() final + { + close(); + } + + int64_t read(void* dest, int64_t bytes) final + { + if (!mFile.good()) + { + return -1; + } + mFile.read(static_cast(dest), bytes); + return mFile.gcount(); + } + + void reset() + { + assert(mFile.good()); + mFile.seekg(0); + } + + bool isOpen() const + { + return mFile.is_open(); + } + +private: + std::ifstream mFile; +}; + +} // namespace samplesCommon + +#endif // STREAM_READER_H diff --git a/src/Detector/tensorrt_yolo/common/timingCache.cpp b/src/Detector/tensorrt_yolo/common/timingCache.cpp new file mode 100644 index 00000000..18e85ba4 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/timingCache.cpp @@ -0,0 +1,157 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "timingCache.h" +#include "NvInfer.h" +#include "fileLock.h" +#include "sampleUtils.h" +#include +#include +#include +#include +#include +#include +using namespace nvinfer1; +namespace nvinfer1 +{ +namespace utils +{ +std::vector loadTimingCacheFile(ILogger& logger, std::string const& inFileName) +{ + try + { + std::unique_ptr fileLock{new FileLock(logger, inFileName)}; + std::ifstream iFile(inFileName, std::ios::in | std::ios::binary); + if (!iFile) + { + std::stringstream ss; + ss << "Could not read timing cache from: " << inFileName + << ". 
A new timing cache will be generated and written."; + logger.log(ILogger::Severity::kWARNING, ss.str().c_str()); + return std::vector(); + } + iFile.seekg(0, std::ifstream::end); + size_t fsize = iFile.tellg(); + iFile.seekg(0, std::ifstream::beg); + std::vector content(fsize); + iFile.read(content.data(), fsize); + iFile.close(); + std::stringstream ss; + ss << "Loaded " << fsize << " bytes of timing cache from " << inFileName; + logger.log(ILogger::Severity::kINFO, ss.str().c_str()); + return content; + } + catch (std::exception const& e) + { + std::cerr << "Exception detected: " << e.what() << std::endl; + } + return {}; +} + +std::unique_ptr buildTimingCacheFromFile( + ILogger& logger, IBuilderConfig& config, std::string const& timingCacheFile, std::ostream& err) +{ + std::unique_ptr timingCache{}; + auto timingCacheContents = loadTimingCacheFile(logger, timingCacheFile); + timingCache.reset(config.createTimingCache(timingCacheContents.data(), timingCacheContents.size())); + SMP_RETVAL_IF_FALSE(timingCache != nullptr, "TimingCache creation failed", nullptr, err); + config.clearFlag(BuilderFlag::kDISABLE_TIMING_CACHE); + SMP_RETVAL_IF_FALSE( + config.setTimingCache(*timingCache, true), "IBuilderConfig setTimingCache failed", nullptr, err); + return timingCache; +} + +void saveTimingCacheFile(ILogger& logger, std::string const& outFileName, IHostMemory const* blob) +{ + try + { + std::unique_ptr fileLock{new FileLock(logger, outFileName)}; + std::ofstream oFile(outFileName, std::ios::out | std::ios::binary); + if (!oFile) + { + std::stringstream ss; + ss << "Could not write timing cache to: " << outFileName; + logger.log(ILogger::Severity::kWARNING, ss.str().c_str()); + return; + } + oFile.write(reinterpret_cast(blob->data()), blob->size()); + oFile.close(); + std::stringstream ss; + ss << "Saved " << blob->size() << " bytes of timing cache to " << outFileName; + logger.log(ILogger::Severity::kINFO, ss.str().c_str()); + } + catch (std::exception const& e) + { + std::cerr << "Exception detected: " << e.what() << std::endl; + } +} + +void updateTimingCacheFile(nvinfer1::ILogger& logger, std::string const& fileName, + nvinfer1::ITimingCache const* timingCache, nvinfer1::IBuilder& builder) +{ + try + { + // Prepare empty timingCache in case that there is no existing file to read + std::unique_ptr config{builder.createBuilderConfig()}; + std::unique_ptr fileTimingCache{config->createTimingCache(static_cast(nullptr), 0)}; + + std::unique_ptr fileLock{new FileLock(logger, fileName)}; + std::ifstream iFile(fileName, std::ios::in | std::ios::binary); + if (iFile) + { + iFile.seekg(0, std::ifstream::end); + size_t fsize = iFile.tellg(); + iFile.seekg(0, std::ifstream::beg); + std::vector content(fsize); + iFile.read(content.data(), fsize); + iFile.close(); + std::stringstream ss; + ss << "Loaded " << fsize << " bytes of timing cache from " << fileName; + logger.log(ILogger::Severity::kINFO, ss.str().c_str()); + fileTimingCache.reset(config->createTimingCache(static_cast(content.data()), content.size())); + if (!fileTimingCache) + { + throw std::runtime_error("Failed to create timingCache from " + fileName + "!"); + } + } + fileTimingCache->combine(*timingCache, false); + std::unique_ptr blob{fileTimingCache->serialize()}; + if (!blob) + { + throw std::runtime_error("Failed to serialize ITimingCache!"); + } + std::ofstream oFile(fileName, std::ios::out | std::ios::binary); + if (!oFile) + { + std::stringstream ss; + ss << "Could not write timing cache to: " << fileName; + 
logger.log(ILogger::Severity::kWARNING, ss.str().c_str()); + return; + } + oFile.write(reinterpret_cast(blob->data()), blob->size()); + oFile.close(); + std::stringstream ss; + ss << "Saved " << blob->size() << " bytes of timing cache to " << fileName; + logger.log(ILogger::Severity::kINFO, ss.str().c_str()); + } + catch (std::exception const& e) + { + std::cerr << "Exception detected: " << e.what() << std::endl; + } +} +} // namespace utils +} // namespace nvinfer1 diff --git a/src/Detector/tensorrt_yolo/common/timingCache.h b/src/Detector/tensorrt_yolo/common/timingCache.h new file mode 100644 index 00000000..c4c76e37 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/timingCache.h @@ -0,0 +1,38 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef TENSORRT_SAMPLES_COMMON_TIMINGCACHE_H_ +#define TENSORRT_SAMPLES_COMMON_TIMINGCACHE_H_ +#include "NvInfer.h" +#include +#include +#include +#include + +namespace nvinfer1 +{ +namespace utils +{ +std::vector loadTimingCacheFile(nvinfer1::ILogger& logger, std::string const& inFileName); +std::unique_ptr buildTimingCacheFromFile( + ILogger& logger, IBuilderConfig& config, std::string const& timingCacheFile, std::ostream& err); +void saveTimingCacheFile(nvinfer1::ILogger& logger, std::string const& outFileName, nvinfer1::IHostMemory const* blob); +void updateTimingCacheFile(nvinfer1::ILogger& logger, std::string const& fileName, + nvinfer1::ITimingCache const* timingCache, nvinfer1::IBuilder& builder); +} // namespace utils +} // namespace nvinfer1 + +#endif // TENSORRT_SAMPLES_COMMON_TIMINGCACHE_H_ diff --git a/src/Detector/tensorrt_yolo/common_deprecated/BatchStream.h b/src/Detector/tensorrt_yolo/common_deprecated/BatchStream.h new file mode 100644 index 00000000..9eaac768 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common_deprecated/BatchStream.h @@ -0,0 +1,388 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef BATCH_STREAM_H +#define BATCH_STREAM_H + +#include "NvInfer.h" +#include "common.h" +#include +#include +#include + +class IBatchStream +{ +public: + virtual void reset(int firstBatch) = 0; + virtual bool next() = 0; + virtual void skip(int skipCount) = 0; + virtual float* getBatch() = 0; + virtual float* getLabels() = 0; + virtual int getBatchesRead() const = 0; + virtual int getBatchSize() const = 0; + virtual nvinfer1::Dims getDims() const = 0; +}; + +class MNISTBatchStream : public IBatchStream +{ +public: + MNISTBatchStream(int batchSize, int maxBatches, const std::string& dataFile, const std::string& labelsFile, + const std::vector& directories) + : mBatchSize{batchSize} + , mMaxBatches{maxBatches} + , mDims{3, {1, 28, 28}} //!< We already know the dimensions of MNIST images. + { + readDataFile(locateFile(dataFile, directories)); + readLabelsFile(locateFile(labelsFile, directories)); + } + + void reset(int firstBatch) override + { + mBatchCount = firstBatch; + } + + bool next() override + { + if (mBatchCount >= mMaxBatches) + { + return false; + } + ++mBatchCount; + return true; + } + + void skip(int skipCount) override + { + mBatchCount += skipCount; + } + + float* getBatch() override + { + return mData.data() + (mBatchCount * mBatchSize * samplesCommon::volume(mDims)); + } + + float* getLabels() override + { + return mLabels.data() + (mBatchCount * mBatchSize); + } + + int getBatchesRead() const override + { + return mBatchCount; + } + + int getBatchSize() const override + { + return mBatchSize; + } + + nvinfer1::Dims getDims() const override + { + return nvinfer1::Dims{4, {mBatchSize, mDims.d[0], mDims.d[1], mDims.d[2]}}; + } + +private: + void readDataFile(const std::string& dataFilePath) + { + std::ifstream file{dataFilePath.c_str(), std::ios::binary}; + + int magicNumber, numImages, imageH, imageW; + file.read(reinterpret_cast(&magicNumber), sizeof(magicNumber)); + // All values in the MNIST files are big endian. + magicNumber = samplesCommon::swapEndianness(magicNumber); + ASSERT(magicNumber == 2051 && "Magic Number does not match the expected value for an MNIST image set"); + + // Read number of images and dimensions + file.read(reinterpret_cast(&numImages), sizeof(numImages)); + file.read(reinterpret_cast(&imageH), sizeof(imageH)); + file.read(reinterpret_cast(&imageW), sizeof(imageW)); + + numImages = samplesCommon::swapEndianness(numImages); + imageH = samplesCommon::swapEndianness(imageH); + imageW = samplesCommon::swapEndianness(imageW); + + // The MNIST data is made up of unsigned bytes, so we need to cast to float and normalize. + int numElements = numImages * imageH * imageW; + std::vector rawData(numElements); + file.read(reinterpret_cast(rawData.data()), numElements * sizeof(uint8_t)); + mData.resize(numElements); + std::transform( + rawData.begin(), rawData.end(), mData.begin(), [](uint8_t val) { return static_cast(val) / 255.f; }); + } + + void readLabelsFile(const std::string& labelsFilePath) + { + std::ifstream file{labelsFilePath.c_str(), std::ios::binary}; + int magicNumber, numImages; + file.read(reinterpret_cast(&magicNumber), sizeof(magicNumber)); + // All values in the MNIST files are big endian. 
+ magicNumber = samplesCommon::swapEndianness(magicNumber); + ASSERT(magicNumber == 2049 && "Magic Number does not match the expected value for an MNIST labels file"); + + file.read(reinterpret_cast(&numImages), sizeof(numImages)); + numImages = samplesCommon::swapEndianness(numImages); + + std::vector rawLabels(numImages); + file.read(reinterpret_cast(rawLabels.data()), numImages * sizeof(uint8_t)); + mLabels.resize(numImages); + std::transform( + rawLabels.begin(), rawLabels.end(), mLabels.begin(), [](uint8_t val) { return static_cast(val); }); + } + + int mBatchSize{0}; + int mBatchCount{0}; //!< The batch that will be read on the next invocation of next() + int mMaxBatches{0}; + nvinfer1::Dims mDims{}; + std::vector mData{}; + std::vector mLabels{}; +}; + +class BatchStream : public IBatchStream +{ +public: + BatchStream( + int batchSize, int maxBatches, std::string prefix, std::string suffix, std::vector directories) + : mBatchSize(batchSize) + , mMaxBatches(maxBatches) + , mPrefix(prefix) + , mSuffix(suffix) + , mDataDir(directories) + { + FILE* file = fopen(locateFile(mPrefix + std::string("0") + mSuffix, mDataDir).c_str(), "rb"); + ASSERT(file != nullptr); + int d[4]; + size_t readSize = fread(d, sizeof(int), 4, file); + ASSERT(readSize == 4); + mDims.nbDims = 4; // The number of dimensions. + mDims.d[0] = d[0]; // Batch Size + mDims.d[1] = d[1]; // Channels + mDims.d[2] = d[2]; // Height + mDims.d[3] = d[3]; // Width + ASSERT(mDims.d[0] > 0 && mDims.d[1] > 0 && mDims.d[2] > 0 && mDims.d[3] > 0); + fclose(file); + + mImageSize = mDims.d[1] * mDims.d[2] * mDims.d[3]; + mBatch.resize(mBatchSize * mImageSize, 0); + mLabels.resize(mBatchSize, 0); + mFileBatch.resize(mDims.d[0] * mImageSize, 0); + mFileLabels.resize(mDims.d[0], 0); + reset(0); + } + + BatchStream(int batchSize, int maxBatches, std::string prefix, std::vector directories) + : BatchStream(batchSize, maxBatches, prefix, ".batch", directories) + { + } + + BatchStream( + int batchSize, int maxBatches, nvinfer1::Dims dims, std::string listFile, std::vector directories) + : mBatchSize(batchSize) + , mMaxBatches(maxBatches) + , mDims(dims) + , mListFile(listFile) + , mDataDir(directories) + { + mImageSize = mDims.d[1] * mDims.d[2] * mDims.d[3]; + mBatch.resize(mBatchSize * mImageSize, 0); + mLabels.resize(mBatchSize, 0); + mFileBatch.resize(mDims.d[0] * mImageSize, 0); + mFileLabels.resize(mDims.d[0], 0); + reset(0); + } + + // Resets data members + void reset(int firstBatch) override + { + mBatchCount = 0; + mFileCount = 0; + mFileBatchPos = mDims.d[0]; + skip(firstBatch); + } + + // Advance to next batch and return true, or return false if there is no batch left. + bool next() override + { + if (mBatchCount == mMaxBatches) + { + return false; + } + + for (int csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize) + { + ASSERT(mFileBatchPos > 0 && mFileBatchPos <= mDims.d[0]); + if (mFileBatchPos == mDims.d[0] && !update()) + { + return false; + } + + // copy the smaller of: elements left to fulfill the request, or elements left in the file buffer. 
+ csize = std::min(mBatchSize - batchPos, mDims.d[0] - mFileBatchPos); + std::copy_n( + getFileBatch() + mFileBatchPos * mImageSize, csize * mImageSize, getBatch() + batchPos * mImageSize); + std::copy_n(getFileLabels() + mFileBatchPos, csize, getLabels() + batchPos); + } + mBatchCount++; + return true; + } + + // Skips the batches + void skip(int skipCount) override + { + if (mBatchSize >= mDims.d[0] && mBatchSize % mDims.d[0] == 0 && mFileBatchPos == mDims.d[0]) + { + mFileCount += skipCount * mBatchSize / mDims.d[0]; + return; + } + + int x = mBatchCount; + for (int i = 0; i < skipCount; i++) + { + next(); + } + mBatchCount = x; + } + + float* getBatch() override + { + return mBatch.data(); + } + + float* getLabels() override + { + return mLabels.data(); + } + + int getBatchesRead() const override + { + return mBatchCount; + } + + int getBatchSize() const override + { + return mBatchSize; + } + + nvinfer1::Dims getDims() const override + { + return mDims; + } + +private: + float* getFileBatch() + { + return mFileBatch.data(); + } + + float* getFileLabels() + { + return mFileLabels.data(); + } + + bool update() + { + if (mListFile.empty()) + { + std::string inputFileName = locateFile(mPrefix + std::to_string(mFileCount++) + mSuffix, mDataDir); + FILE* file = fopen(inputFileName.c_str(), "rb"); + if (!file) + { + return false; + } + + int d[4]; + size_t readSize = fread(d, sizeof(int), 4, file); + ASSERT(readSize == 4); + ASSERT(mDims.d[0] == d[0] && mDims.d[1] == d[1] && mDims.d[2] == d[2] && mDims.d[3] == d[3]); + size_t readInputCount = fread(getFileBatch(), sizeof(float), mDims.d[0] * mImageSize, file); + ASSERT(readInputCount == size_t(mDims.d[0] * mImageSize)); + size_t readLabelCount = fread(getFileLabels(), sizeof(float), mDims.d[0], file); + ASSERT(readLabelCount == 0 || readLabelCount == size_t(mDims.d[0])); + + fclose(file); + } + else + { + std::vector fNames; + std::ifstream file(locateFile(mListFile, mDataDir), std::ios::binary); + if (!file) + { + return false; + } + + sample::gLogInfo << "Batch #" << mFileCount << std::endl; + file.seekg(((mBatchCount * mBatchSize)) * 7); + + for (int i = 1; i <= mBatchSize; i++) + { + std::string sName; + std::getline(file, sName); + sName = sName + ".ppm"; + sample::gLogInfo << "Calibrating with file " << sName << std::endl; + fNames.emplace_back(sName); + } + + mFileCount++; + + const int imageC = 3; + const int imageH = 300; + const int imageW = 300; + std::vector> ppms(fNames.size()); + for (uint32_t i = 0; i < fNames.size(); ++i) + { + readPPMFile(locateFile(fNames[i], mDataDir), ppms[i]); + } + + std::vector data(samplesCommon::volume(mDims)); + const float scale = 2.0 / 255.0; + const float bias = 1.0; + long int volChl = mDims.d[2] * mDims.d[3]; + + // Normalize input data + for (int i = 0, volImg = mDims.d[1] * mDims.d[2] * mDims.d[3]; i < mBatchSize; ++i) + { + for (int c = 0; c < mDims.d[1]; ++c) + { + for (int j = 0; j < volChl; ++j) + { + data[i * volImg + c * volChl + j] = scale * float(ppms[i].buffer[j * mDims.d[1] + c]) - bias; + } + } + } + + std::copy_n(data.data(), mDims.d[0] * mImageSize, getFileBatch()); + } + + mFileBatchPos = 0; + return true; + } + + int mBatchSize{0}; + int mMaxBatches{0}; + int mBatchCount{0}; + int mFileCount{0}; + int mFileBatchPos{0}; + int mImageSize{0}; + std::vector mBatch; //!< Data for the batch + std::vector mLabels; //!< Labels for the batch + std::vector mFileBatch; //!< List of image files + std::vector mFileLabels; //!< List of label files + std::string mPrefix; //!< Batch file name 
+    std::string mSuffix;               //!< Batch file name suffix
+    nvinfer1::Dims mDims;              //!< Input dimensions
+    std::string mListFile;             //!< File name of the list of image names
+    std::vector<std::string> mDataDir; //!< Directories where the files can be found
+};
+
+#endif
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/EntropyCalibrator.h b/src/Detector/tensorrt_yolo/common_deprecated/EntropyCalibrator.h
new file mode 100644
index 00000000..f31789bf
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/EntropyCalibrator.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ENTROPY_CALIBRATOR_H
+#define ENTROPY_CALIBRATOR_H
+
+#include "BatchStream.h"
+#include "NvInfer.h"
+
+//! \class EntropyCalibratorImpl
+//!
+//! \brief Implements common functionality for Entropy calibrators.
+//!
+template <typename TBatchStream>
+class EntropyCalibratorImpl
+{
+public:
+    EntropyCalibratorImpl(
+        TBatchStream stream, int firstBatch, std::string networkName, const char* inputBlobName, bool readCache = true)
+        : mStream{stream}
+        , mCalibrationTableName("CalibrationTable" + networkName)
+        , mInputBlobName(inputBlobName)
+        , mReadCache(readCache)
+    {
+        nvinfer1::Dims dims = mStream.getDims();
+        mInputCount = samplesCommon::volume(dims);
+        CHECK(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float)));
+        mStream.reset(firstBatch);
+    }
+
+    virtual ~EntropyCalibratorImpl()
+    {
+        CHECK(cudaFree(mDeviceInput));
+    }
+
+    int getBatchSize() const noexcept
+    {
+        return mStream.getBatchSize();
+    }
+
+    bool getBatch(void* bindings[], const char* names[], int /*nbBindings*/) noexcept
+    {
+        if (!mStream.next())
+            return false;
+
+        CHECK(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice));
+        ASSERT(!strcmp(names[0], mInputBlobName));
+        bindings[0] = mDeviceInput;
+        return true;
+    }
+
+    const void* readCalibrationCache(size_t& length) noexcept
+    {
+        mCalibrationCache.clear();
+        std::ifstream input(mCalibrationTableName, std::ios::binary);
+        input >> std::noskipws;
+        if (mReadCache && input.good())
+        {
+            std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(),
+                std::back_inserter(mCalibrationCache));
+        }
+        length = mCalibrationCache.size();
+        return length ? mCalibrationCache.data() : nullptr;
+    }
+
+    void writeCalibrationCache(const void* cache, size_t length) noexcept
+    {
+        std::ofstream output(mCalibrationTableName, std::ios::binary);
+        output.write(reinterpret_cast<const char*>(cache), length);
+    }
+
+private:
+    TBatchStream mStream;
+    size_t mInputCount;
+    std::string mCalibrationTableName;
+    const char* mInputBlobName;
+    bool mReadCache{true};
+    void* mDeviceInput{nullptr};
+    std::vector<char> mCalibrationCache;
+};
+
+//! \class Int8EntropyCalibrator2
+//!
+//! \brief Implements Entropy calibrator 2.
+//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
+//!
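+//! A minimal usage sketch (illustrative only: the BatchStream constructor
+//! arguments, network name and input tensor name are assumptions, not part
+//! of this header):
+//!
+//!     BatchStream stream(batchSize, maxBatches, "batches/batch", dataDirs);
+//!     Int8EntropyCalibrator2<BatchStream> calibrator(stream, 0, "yolo", "images");
+//!     builderConfig->setFlag(nvinfer1::BuilderFlag::kINT8);
+//!     builderConfig->setInt8Calibrator(&calibrator);
+//!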
+template <typename TBatchStream>
+class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2
+{
+public:
+    Int8EntropyCalibrator2(
+        TBatchStream stream, int firstBatch, const char* networkName, const char* inputBlobName, bool readCache = true)
+        : mImpl(stream, firstBatch, networkName, inputBlobName, readCache)
+    {
+    }
+
+    int getBatchSize() const noexcept override
+    {
+        return mImpl.getBatchSize();
+    }
+
+    bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept override
+    {
+        return mImpl.getBatch(bindings, names, nbBindings);
+    }
+
+    const void* readCalibrationCache(size_t& length) noexcept override
+    {
+        return mImpl.readCalibrationCache(length);
+    }
+
+    void writeCalibrationCache(const void* cache, size_t length) noexcept override
+    {
+        mImpl.writeCalibrationCache(cache, length);
+    }
+
+private:
+    EntropyCalibratorImpl<TBatchStream> mImpl;
+};
+
+#endif // ENTROPY_CALIBRATOR_H
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/ErrorRecorder.h b/src/Detector/tensorrt_yolo/common_deprecated/ErrorRecorder.h
new file mode 100644
index 00000000..40b35fb5
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/ErrorRecorder.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ERROR_RECORDER_H
+#define ERROR_RECORDER_H
+#include "NvInferRuntimeCommon.h"
+#include "logger.h"
+#include <atomic>
+#include <cstdint>
+#include <exception>
+#include <mutex>
+#include <vector>
+
+using nvinfer1::IErrorRecorder;
+using nvinfer1::ErrorCode;
+
+//!
+//! A simple implementation of the IErrorRecorder interface for
+//! use by samples. This interface also can be used as a reference
+//! implementation.
+//! The sample Error recorder is based on a vector that pairs the error
+//! code and the error string into a single element. It also uses
+//! standard mutexes and atomics in order to make sure that the code
+//! works in a multi-threaded environment.
+//!
+class SampleErrorRecorder : public IErrorRecorder
+{
+    using errorPair = std::pair<ErrorCode, std::string>;
+    using errorStack = std::vector<errorPair>;
+
+public:
+    SampleErrorRecorder() = default;
+
+    virtual ~SampleErrorRecorder() noexcept {}
+    int32_t getNbErrors() const noexcept final
+    {
+        return mErrorStack.size();
+    }
+    ErrorCode getErrorCode(int32_t errorIdx) const noexcept final
+    {
+        return invalidIndexCheck(errorIdx) ? ErrorCode::kINVALID_ARGUMENT : (*this)[errorIdx].first;
+    };
+    IErrorRecorder::ErrorDesc getErrorDesc(int32_t errorIdx) const noexcept final
+    {
+        return invalidIndexCheck(errorIdx) ? "errorIdx out of range." : (*this)[errorIdx].second.c_str();
+    }
+    // This class can never overflow since we have dynamic resize via std::vector usage.
+    bool hasOverflowed() const noexcept final
+    {
+        return false;
+    }
+
+    // Empty the errorStack.
+    void clear() noexcept final
+    {
+        try
+        {
+            // grab a lock so that there is no addition while clearing.
+            std::lock_guard<std::mutex> guard(mStackLock);
+            mErrorStack.clear();
+        }
+        catch (const std::exception& e)
+        {
+            sample::gLogFatal << "Internal Error: " << e.what() << std::endl;
+        }
+    };
+
+    //! Simple helper function that returns true if the stack is empty.
+    bool empty() const noexcept
+    {
+        return mErrorStack.empty();
+    }
+
+    bool reportError(ErrorCode val, IErrorRecorder::ErrorDesc desc) noexcept final
+    {
+        try
+        {
+            std::lock_guard<std::mutex> guard(mStackLock);
+            sample::gLogError << "Error[" << static_cast<int32_t>(val) << "]: " << desc << std::endl;
+            mErrorStack.push_back(errorPair(val, desc));
+        }
+        catch (const std::exception& e)
+        {
+            sample::gLogFatal << "Internal Error: " << e.what() << std::endl;
+        }
+        // All errors are considered fatal.
+        return true;
+    }
+
+    // Atomically increment or decrement the ref counter.
+    IErrorRecorder::RefCount incRefCount() noexcept final
+    {
+        return ++mRefCount;
+    }
+    IErrorRecorder::RefCount decRefCount() noexcept final
+    {
+        return --mRefCount;
+    }
+
+private:
+    // Simple helper functions.
+    const errorPair& operator[](size_t index) const noexcept
+    {
+        return mErrorStack[index];
+    }
+
+    bool invalidIndexCheck(int32_t index) const noexcept
+    {
+        // By converting signed to unsigned, we only need a single check since
+        // negative numbers turn into large positive greater than the size.
+        size_t sIndex = index;
+        return sIndex >= mErrorStack.size();
+    }
+    // Mutex to hold when locking mErrorStack.
+    std::mutex mStackLock;
+
+    // Reference count of the class. Destruction of the class when mRefCount
+    // is not zero causes undefined behavior.
+    std::atomic<int32_t> mRefCount{0};
+
+    // The error stack that holds the errors recorded by TensorRT.
+    errorStack mErrorStack;
+}; // class SampleErrorRecorder
+#endif // ERROR_RECORDER_H
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/buffers.h b/src/Detector/tensorrt_yolo/common_deprecated/buffers.h
new file mode 100644
index 00000000..ef673b2b
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/buffers.h
@@ -0,0 +1,478 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef TENSORRT_BUFFERS_H
+#define TENSORRT_BUFFERS_H
+
+#include "NvInfer.h"
+#include "common.h"
+#include "half.h"
+#include <cassert>
+#include <cuda_runtime_api.h>
+#include <iostream>
+#include <iterator>
+#include <memory>
+#include <new>
+#include <numeric>
+#include <string>
+#include <vector>
+
+namespace samplesCommon
+{
+
+//!
+//! \brief The GenericBuffer class is a templated class for buffers.
+//!
+//! \details This templated RAII (Resource Acquisition Is Initialization) class handles the allocation,
+//! deallocation, querying of buffers on both the device and the host.
+//! It can handle data of arbitrary types because it stores byte buffers.
+//! The template parameters AllocFunc and FreeFunc are used for the
+//! allocation and deallocation of the buffer.
+//! AllocFunc must be a functor that takes in (void** ptr, size_t size)
+//! and returns bool. ptr is a pointer to where the allocated buffer address should be stored.
+//!
size is the amount of memory in bytes to allocate. +//! The boolean indicates whether or not the memory allocation was successful. +//! FreeFunc must be a functor that takes in (void* ptr) and returns void. +//! ptr is the allocated buffer address. It must work with nullptr input. +//! +template +class GenericBuffer +{ +public: + //! + //! \brief Construct an empty buffer. + //! + GenericBuffer(nvinfer1::DataType type = nvinfer1::DataType::kFLOAT) + : mSize(0) + , mCapacity(0) + , mType(type) + , mBuffer(nullptr) + { + } + + //! + //! \brief Construct a buffer with the specified allocation size in bytes. + //! + GenericBuffer(size_t size, nvinfer1::DataType type) + : mSize(size) + , mCapacity(size) + , mType(type) + { + if (!allocFn(&mBuffer, this->nbBytes())) + { + throw std::bad_alloc(); + } + } + + GenericBuffer(GenericBuffer&& buf) + : mSize(buf.mSize) + , mCapacity(buf.mCapacity) + , mType(buf.mType) + , mBuffer(buf.mBuffer) + { + buf.mSize = 0; + buf.mCapacity = 0; + buf.mType = nvinfer1::DataType::kFLOAT; + buf.mBuffer = nullptr; + } + + GenericBuffer& operator=(GenericBuffer&& buf) + { + if (this != &buf) + { + freeFn(mBuffer); + mSize = buf.mSize; + mCapacity = buf.mCapacity; + mType = buf.mType; + mBuffer = buf.mBuffer; + // Reset buf. + buf.mSize = 0; + buf.mCapacity = 0; + buf.mBuffer = nullptr; + } + return *this; + } + + //! + //! \brief Returns pointer to underlying array. + //! + void* data() + { + return mBuffer; + } + + //! + //! \brief Returns pointer to underlying array. + //! + const void* data() const + { + return mBuffer; + } + + //! + //! \brief Returns the size (in number of elements) of the buffer. + //! + size_t size() const + { + return mSize; + } + + //! + //! \brief Returns the size (in bytes) of the buffer. + //! + size_t nbBytes() const + { + return this->size() * samplesCommon::getElementSize(mType); + } + + //! + //! \brief Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity. + //! + void resize(size_t newSize) + { + mSize = newSize; + if (mCapacity < newSize) + { + freeFn(mBuffer); + if (!allocFn(&mBuffer, this->nbBytes())) + { + throw std::bad_alloc{}; + } + mCapacity = newSize; + } + } + + //! + //! \brief Overload of resize that accepts Dims + //! + void resize(const nvinfer1::Dims& dims) + { + return this->resize(samplesCommon::volume(dims)); + } + + ~GenericBuffer() + { + freeFn(mBuffer); + } + +private: + size_t mSize{0}, mCapacity{0}; + nvinfer1::DataType mType; + void* mBuffer; + AllocFunc allocFn; + FreeFunc freeFn; +}; + +class DeviceAllocator +{ +public: + bool operator()(void** ptr, size_t size) const + { + return cudaMalloc(ptr, size) == cudaSuccess; + } +}; + +class DeviceFree +{ +public: + void operator()(void* ptr) const + { + cudaFree(ptr); + } +}; + +class HostAllocator +{ +public: + bool operator()(void** ptr, size_t size) const + { + *ptr = malloc(size); + return *ptr != nullptr; + } +}; + +class HostFree +{ +public: + void operator()(void* ptr) const + { + free(ptr); + } +}; + +using DeviceBuffer = GenericBuffer; +using HostBuffer = GenericBuffer; + +//! +//! \brief The ManagedBuffer class groups together a pair of corresponding device and host buffers. +//! +class ManagedBuffer +{ +public: + DeviceBuffer deviceBuffer; + HostBuffer hostBuffer; +}; + +//! +//! \brief The BufferManager class handles host and device buffer allocation and deallocation. +//! +//! \details This RAII class handles host and device buffer allocation and deallocation, +//! 
memcpy between host and device buffers to aid with inference, +//! and debugging dumps to validate inference. The BufferManager class is meant to be +//! used to simplify buffer management and any interactions between buffers and the engine. +//! +class BufferManager +{ +public: + static const size_t kINVALID_SIZE_VALUE = ~size_t(0); + + //! + //! \brief Create a BufferManager for handling buffer interactions with engine. + //! + BufferManager(std::shared_ptr engine, const int batchSize, + const nvinfer1::IExecutionContext* context = nullptr) + : mEngine(engine) + , mBatchSize(batchSize) + { + // Full Dims implies no batch size. + auto impbs = engine->hasImplicitBatchDimension(); + std::cout << "hasImplicitBatchDimension: " << impbs << ", mBatchSize = " << mBatchSize << std::endl; + assert(engine->hasImplicitBatchDimension() || mBatchSize == 0); + // Create host and device buffers + for (int i = 0; i < mEngine->getNbBindings(); i++) + { + auto dims = context ? context->getBindingDimensions(i) : mEngine->getBindingDimensions(i); + size_t vol = context || !mBatchSize ? 1 : static_cast(mBatchSize); + nvinfer1::DataType type = mEngine->getBindingDataType(i); + int vecDim = mEngine->getBindingVectorizedDim(i); + if (-1 != vecDim) // i.e., 0 != lgScalarsPerVector + { + int scalarsPerVec = mEngine->getBindingComponentsPerElement(i); + dims.d[vecDim] = divUp(dims.d[vecDim], scalarsPerVec); + vol *= scalarsPerVec; + } + vol *= samplesCommon::volume(dims); + std::unique_ptr manBuf{new ManagedBuffer()}; + manBuf->deviceBuffer = DeviceBuffer(vol, type); + manBuf->hostBuffer = HostBuffer(vol, type); + mDeviceBindings.emplace_back(manBuf->deviceBuffer.data()); + mManagedBuffers.emplace_back(std::move(manBuf)); + } + } + + //! + //! \brief Returns a vector of device buffers that you can use directly as + //! bindings for the execute and enqueue methods of IExecutionContext. + //! + std::vector& getDeviceBindings() + { + return mDeviceBindings; + } + + //! + //! \brief Returns a vector of device buffers. + //! + const std::vector& getDeviceBindings() const + { + return mDeviceBindings; + } + + //! + //! \brief Returns the device buffer corresponding to tensorName. + //! Returns nullptr if no such tensor can be found. + //! + void* getDeviceBuffer(const std::string& tensorName) const + { + return getBuffer(false, tensorName); + } + + //! + //! \brief Returns the host buffer corresponding to tensorName. + //! Returns nullptr if no such tensor can be found. + //! + void* getHostBuffer(const std::string& tensorName) const + { + return getBuffer(true, tensorName); + } + + //! + //! \brief Returns the host buffer corresponding to tensorName. + //! Returns nullptr if no such tensor can be found. + //! + void* getHostBuffer(int bindingIndex) const + { + return getBuffer(true, bindingIndex); + } + + //! + //! \brief Returns the size of the host and device buffers that correspond to tensorName. + //! Returns kINVALID_SIZE_VALUE if no such tensor can be found. + //! + size_t size(const std::string& tensorName) const + { + int index = mEngine->getBindingIndex(tensorName.c_str()); + if (index == -1) + return kINVALID_SIZE_VALUE; + return mManagedBuffers[index]->hostBuffer.nbBytes(); + } + + //! + //! \brief Dump host buffer with specified tensorName to ostream. + //! Prints error message to std::ostream if no such tensor can be found. + //! 
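+//! For illustration, a typical end-to-end use that finishes with a dump
+//! (the engine/context variables, the executeV2 call and the tensor name
+//! "output" are assumptions, not part of this header):
+//!
+//!     samplesCommon::BufferManager buffers(engine, 0, context.get());
+//!     buffers.copyInputToDevice();
+//!     context->executeV2(buffers.getDeviceBindings().data());
+//!     buffers.copyOutputToHost();
+//!     buffers.dumpBuffer(std::cout, "output");
+//!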
+ void dumpBuffer(std::ostream& os, const std::string& tensorName) + { + int index = mEngine->getBindingIndex(tensorName.c_str()); + if (index == -1) + { + os << "Invalid tensor name" << std::endl; + return; + } + void* buf = mManagedBuffers[index]->hostBuffer.data(); + size_t bufSize = mManagedBuffers[index]->hostBuffer.nbBytes(); + nvinfer1::Dims bufDims = mEngine->getBindingDimensions(index); + size_t rowCount = static_cast(bufDims.nbDims > 0 ? bufDims.d[bufDims.nbDims - 1] : mBatchSize); + int leadDim = mBatchSize; + int* trailDims = bufDims.d; + int nbDims = bufDims.nbDims; + + // Fix explicit Dimension networks + if (!leadDim && nbDims > 0) + { + leadDim = bufDims.d[0]; + ++trailDims; + --nbDims; + } + + os << "[" << leadDim; + for (int i = 0; i < nbDims; i++) + os << ", " << trailDims[i]; + os << "]" << std::endl; + switch (mEngine->getBindingDataType(index)) + { + case nvinfer1::DataType::kINT32: print(os, buf, bufSize, rowCount); break; + case nvinfer1::DataType::kFLOAT: print(os, buf, bufSize, rowCount); break; + case nvinfer1::DataType::kHALF: print(os, buf, bufSize, rowCount); break; + case nvinfer1::DataType::kINT8: assert(0 && "Int8 network-level input and output is not supported"); break; + case nvinfer1::DataType::kBOOL: assert(0 && "Bool network-level input and output are not supported"); break; + } + } + + //! + //! \brief Templated print function that dumps buffers of arbitrary type to std::ostream. + //! rowCount parameter controls how many elements are on each line. + //! A rowCount of 1 means that there is only 1 element on each line. + //! + template + void print(std::ostream& os, void* buf, size_t bufSize, size_t rowCount) + { + assert(rowCount != 0); + assert(bufSize % sizeof(T) == 0); + T* typedBuf = static_cast(buf); + size_t numItems = bufSize / sizeof(T); + for (int i = 0; i < static_cast(numItems); i++) + { + // Handle rowCount == 1 case + if (rowCount == 1 && i != static_cast(numItems) - 1) + os << typedBuf[i] << std::endl; + else if (rowCount == 1) + os << typedBuf[i]; + // Handle rowCount > 1 case + else if (i % rowCount == 0) + os << typedBuf[i]; + else if (i % rowCount == rowCount - 1) + os << " " << typedBuf[i] << std::endl; + else + os << " " << typedBuf[i]; + } + } + + //! + //! \brief Copy the contents of input host buffers to input device buffers synchronously. + //! + void copyInputToDevice() + { + memcpyBuffers(true, false, false, 0); + } + + //! + //! \brief Copy the contents of output device buffers to output host buffers synchronously. + //! + void copyOutputToHost() + { + memcpyBuffers(false, true, false, 0); + } + + //! + //! \brief Copy the contents of input host buffers to input device buffers asynchronously. + //! + void copyInputToDeviceAsync(const cudaStream_t& stream) + { + memcpyBuffers(true, false, true, stream); + } + + //! + //! \brief Copy the contents of output device buffers to output host buffers asynchronously. + //! + void copyOutputToHostAsync(const cudaStream_t& stream) + { + memcpyBuffers(false, true, true, stream); + } + + ~BufferManager() = default; + +private: + void* getBuffer(const bool isHost, const std::string& tensorName) const + { + int index = mEngine->getBindingIndex(tensorName.c_str()); + if (index == -1) + return nullptr; + return (isHost ? mManagedBuffers[index]->hostBuffer.data() : mManagedBuffers[index]->deviceBuffer.data()); + } + + void* getBuffer(const bool isHost, int bindingIndex) const + { + if (bindingIndex == -1) + return nullptr; + return (isHost ? 
mManagedBuffers[bindingIndex]->hostBuffer.data() : mManagedBuffers[bindingIndex]->deviceBuffer.data()); + } + + void memcpyBuffers(const bool copyInput, const bool deviceToHost, const bool async, const cudaStream_t& stream) + { + for (int i = 0; i < mEngine->getNbBindings(); i++) + { + void* dstPtr = deviceToHost ? mManagedBuffers[i]->hostBuffer.data() : mManagedBuffers[i]->deviceBuffer.data(); + const void* srcPtr = deviceToHost ? mManagedBuffers[i]->deviceBuffer.data() : mManagedBuffers[i]->hostBuffer.data(); + const size_t byteSize = mManagedBuffers[i]->hostBuffer.nbBytes(); + const cudaMemcpyKind memcpyType = deviceToHost ? cudaMemcpyDeviceToHost : cudaMemcpyHostToDevice; + if ((copyInput && mEngine->bindingIsInput(i)) || (!copyInput && !mEngine->bindingIsInput(i))) + { + if (async) + CHECK(cudaMemcpyAsync(dstPtr, srcPtr, byteSize, memcpyType, stream)); + else + CHECK(cudaMemcpy(dstPtr, srcPtr, byteSize, memcpyType)); + } + } + } + + std::shared_ptr mEngine; //!< The pointer to the engine + int mBatchSize = 0; //!< The batch size for legacy networks, 0 otherwise. + std::vector> mManagedBuffers; //!< The vector of pointers to managed buffers + std::vector mDeviceBindings; //!< The vector of device buffers needed for engine execution +}; + +} // namespace samplesCommon + +#endif // TENSORRT_BUFFERS_H diff --git a/src/Detector/tensorrt_yolo/common_deprecated/common.h b/src/Detector/tensorrt_yolo/common_deprecated/common.h new file mode 100644 index 00000000..2270a2cd --- /dev/null +++ b/src/Detector/tensorrt_yolo/common_deprecated/common.h @@ -0,0 +1,963 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TENSORRT_COMMON_H +#define TENSORRT_COMMON_H + +// For loadLibrary +#ifdef _MSC_VER +// Needed so that the max/min definitions in windows.h do not conflict with std::max/min. 
+#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#undef NOMINMAX +#else +#include +#endif + +#include "NvInfer.h" +#include "NvInferPlugin.h" +#include "logger.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "safeCommon.h" + +#ifdef _MSC_VER +#define FN_NAME __FUNCTION__ +#else +#define FN_NAME __func__ +#endif + +#if defined(__aarch64__) || defined(__QNX__) +#define ENABLE_DLA_API 1 +#endif + +#define CHECK_RETURN_W_MSG(status, val, errMsg) \ + do \ + { \ + if (!(status)) \ + { \ + sample::gLogError << errMsg << " Error in " << __FILE__ << ", function " << FN_NAME << "(), line " << __LINE__ \ + << std::endl; \ + return val; \ + } \ + } while (0) + +#undef ASSERT +#define ASSERT(condition) \ + do \ + { \ + if (!(condition)) \ + { \ + sample::gLogError << "Assertion failure: " << #condition << std::endl; \ + abort(); \ + } \ + } while (0) + + +#define CHECK_RETURN(status, val) CHECK_RETURN_W_MSG(status, val, "") + +#define OBJ_GUARD(A) std::unique_ptr + +template +OBJ_GUARD(T) +makeObjGuard(T_* t) +{ + CHECK(!(std::is_base_of::value || std::is_same::value)); + auto deleter = [](T* t) { t->destroy(); }; + return std::unique_ptr{static_cast(t), deleter}; +} + +constexpr long double operator"" _GiB(long double val) +{ + return val * (1 << 30); +} +constexpr long double operator"" _MiB(long double val) +{ + return val * (1 << 20); +} +constexpr long double operator"" _KiB(long double val) +{ + return val * (1 << 10); +} + +// These is necessary if we want to be able to write 1_GiB instead of 1.0_GiB. +// Since the return type is signed, -1_GiB will work as expected. +constexpr long long int operator"" _GiB(unsigned long long val) +{ + return val * (1 << 30); +} +constexpr long long int operator"" _MiB(unsigned long long val) +{ + return val * (1 << 20); +} +constexpr long long int operator"" _KiB(unsigned long long val) +{ + return val * (1 << 10); +} + +struct SimpleProfiler : public nvinfer1::IProfiler +{ + struct Record + { + float time{0}; + int count{0}; + }; + + virtual void reportLayerTime(const char* layerName, float ms) noexcept + { + mProfile[layerName].count++; + mProfile[layerName].time += ms; + if (std::find(mLayerNames.begin(), mLayerNames.end(), layerName) == mLayerNames.end()) + { + mLayerNames.push_back(layerName); + } + } + + SimpleProfiler(const char* name, const std::vector& srcProfilers = std::vector()) + : mName(name) + { + for (const auto& srcProfiler : srcProfilers) + { + for (const auto& rec : srcProfiler.mProfile) + { + auto it = mProfile.find(rec.first); + if (it == mProfile.end()) + { + mProfile.insert(rec); + } + else + { + it->second.time += rec.second.time; + it->second.count += rec.second.count; + } + } + } + } + + friend std::ostream& operator<<(std::ostream& out, const SimpleProfiler& value) + { + out << "========== " << value.mName << " profile ==========" << std::endl; + float totalTime = 0; + std::string layerNameStr = "TensorRT layer name"; + int maxLayerNameLength = std::max(static_cast(layerNameStr.size()), 70); + for (const auto& elem : value.mProfile) + { + totalTime += elem.second.time; + maxLayerNameLength = std::max(maxLayerNameLength, static_cast(elem.first.size())); + } + + auto old_settings = out.flags(); + auto old_precision = out.precision(); + // Output header + { + out << std::setw(maxLayerNameLength) << layerNameStr << " "; + out << std::setw(12) << "Runtime, " + << "%" + << " "; + 
out << std::setw(12) << "Invocations" + << " "; + out << std::setw(12) << "Runtime, ms" << std::endl; + } + for (size_t i = 0; i < value.mLayerNames.size(); i++) + { + const std::string layerName = value.mLayerNames[i]; + auto elem = value.mProfile.at(layerName); + out << std::setw(maxLayerNameLength) << layerName << " "; + out << std::setw(12) << std::fixed << std::setprecision(1) << (elem.time * 100.0F / totalTime) << "%" + << " "; + out << std::setw(12) << elem.count << " "; + out << std::setw(12) << std::fixed << std::setprecision(2) << elem.time << std::endl; + } + out.flags(old_settings); + out.precision(old_precision); + out << "========== " << value.mName << " total runtime = " << totalTime << " ms ==========" << std::endl; + + return out; + } + +private: + std::string mName; + std::vector mLayerNames; + std::map mProfile; +}; + +//! Locate path to file, given its filename or filepath suffix and possible dirs it might lie in. +//! Function will also walk back MAX_DEPTH dirs from CWD to check for such a file path. +inline std::string locateFile( + const std::string& filepathSuffix, const std::vector& directories, bool reportError = true) +{ + const int MAX_DEPTH{10}; + bool found{false}; + std::string filepath; + + for (auto& dir : directories) + { + if (!dir.empty() && dir.back() != '/') + { +#ifdef _MSC_VER + filepath = dir + "\\" + filepathSuffix; +#else + filepath = dir + "/" + filepathSuffix; +#endif + } + else + { + filepath = dir + filepathSuffix; + } + + for (int i = 0; i < MAX_DEPTH && !found; i++) + { + const std::ifstream checkFile(filepath); + found = checkFile.is_open(); + if (found) + { + break; + } + + filepath = "../" + filepath; // Try again in parent dir + } + + if (found) + { + break; + } + + filepath.clear(); + } + + // Could not find the file + if (filepath.empty()) + { + const std::string dirList = std::accumulate(directories.begin() + 1, directories.end(), directories.front(), + [](const std::string& a, const std::string& b) { return a + "\n\t" + b; }); + std::cout << "Could not find " << filepathSuffix << " in data directories:\n\t" << dirList << std::endl; + + if (reportError) + { + std::cout << "&&&& FAILED" << std::endl; + exit(EXIT_FAILURE); + } + } + + return filepath; +} + +inline void readPGMFile(const std::string& fileName, uint8_t* buffer, int inH, int inW) +{ + std::ifstream infile(fileName, std::ifstream::binary); + assert(infile.is_open() && "Attempting to read from a file that is not open."); + std::string magic, h, w, max; + infile >> magic >> h >> w >> max; + infile.seekg(1, infile.cur); + infile.read(reinterpret_cast(buffer), inH * inW); +} + +namespace samplesCommon +{ + +// Swaps endianness of an integral type. 
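+// As a worked example (added for illustration, not part of the upstream header):
+// swapEndianness<uint32_t>(0x11223344u) returns 0x44332211u.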
+template ::value, int>::type = 0> +inline T swapEndianness(const T& value) +{ + uint8_t bytes[sizeof(T)]; + for (int i = 0; i < static_cast(sizeof(T)); ++i) + { + bytes[sizeof(T) - 1 - i] = *(reinterpret_cast(&value) + i); + } + return *reinterpret_cast(bytes); +} + +class HostMemory +{ +public: + HostMemory() = delete; + virtual void* data() const noexcept + { + return mData; + } + virtual std::size_t size() const noexcept + { + return mSize; + } + virtual nvinfer1::DataType type() const noexcept + { + return mType; + } + virtual ~HostMemory() {} + +protected: + HostMemory(std::size_t size, nvinfer1::DataType type) + : mData{nullptr} + , mSize(size) + , mType(type) + { + } + void* mData; + std::size_t mSize; + nvinfer1::DataType mType; +}; + +template +class TypedHostMemory : public HostMemory +{ +public: + explicit TypedHostMemory(std::size_t size) + : HostMemory(size, dataType) + { + mData = new ElemType[size]; + }; + ~TypedHostMemory() noexcept + { + delete[](ElemType*) mData; + } + ElemType* raw() noexcept + { + return static_cast(data()); + } +}; + +using FloatMemory = TypedHostMemory; +using HalfMemory = TypedHostMemory; +using ByteMemory = TypedHostMemory; + +inline void* safeCudaMalloc(size_t memSize) +{ + void* deviceMem; + CHECK(cudaMalloc(&deviceMem, memSize)); + if (deviceMem == nullptr) + { + std::cerr << "Out of memory" << std::endl; + exit(1); + } + return deviceMem; +} + +inline bool isDebug() +{ + return (std::getenv("TENSORRT_DEBUG") ? true : false); +} + +struct InferDeleter +{ + template + void operator()(T* obj) const + { +#if (NV_TENSORRT_MAJOR < 8) + obj->destroy(); +#else + delete obj; +#endif + } +}; + +template +using SampleUniquePtr = std::unique_ptr; + +static auto StreamDeleter = [](cudaStream_t* pStream) + { + if (pStream) + { + cudaStreamDestroy(*pStream); + delete pStream; + } + }; + +inline std::unique_ptr makeCudaStream() +{ + std::unique_ptr pStream(new cudaStream_t, StreamDeleter); + if (cudaStreamCreateWithFlags(pStream.get(), cudaStreamNonBlocking) != cudaSuccess) + { + pStream.reset(nullptr); + } + + return pStream; +} + +//! Return vector of indices that puts magnitudes of sequence in descending order. +template +std::vector argMagnitudeSort(Iter begin, Iter end) +{ + std::vector indices(end - begin); + std::iota(indices.begin(), indices.end(), 0); + std::sort(indices.begin(), indices.end(), [&begin](size_t i, size_t j) { return std::abs(begin[j]) < std::abs(begin[i]); }); + return indices; +} + +inline bool readReferenceFile(const std::string& fileName, std::vector& refVector) +{ + std::ifstream infile(fileName); + if (!infile.is_open()) + { + std::cout << "ERROR: readReferenceFile: Attempting to read from a file that is not open." << std::endl; + return false; + } + std::string line; + while (std::getline(infile, line)) + { + if (line.empty()) + continue; + refVector.push_back(line); + } + infile.close(); + return true; +} + +template +std::vector classify( + const std::vector& refVector, const std::vector& output, const size_t topK) +{ + const auto inds = samplesCommon::argMagnitudeSort(output.cbegin(), output.cend()); + std::vector result; + result.reserve(topK); + for (size_t k = 0; k < topK; ++k) + { + result.push_back(refVector[inds[k]]); + } + return result; +} + +// Returns indices of highest K magnitudes in v. 
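+// A small worked example (illustrative only): for v = {0.1f, -5.0f, 2.0f},
+// topKMagnitudes(v, 2) returns {1, 2}, the indices of -5.0f and 2.0f.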
+template <typename T>
+std::vector<size_t> topKMagnitudes(const std::vector<T>& v, const size_t k)
+{
+    std::vector<size_t> indices = samplesCommon::argMagnitudeSort(v.cbegin(), v.cend());
+    indices.resize(k);
+    return indices;
+}
+
+template <typename T>
+bool readASCIIFile(const std::string& fileName, const size_t size, std::vector<T>& out)
+{
+    std::ifstream infile(fileName);
+    if (!infile.is_open())
+    {
+        std::cout << "ERROR readASCIIFile: Attempting to read from a file that is not open." << std::endl;
+        return false;
+    }
+    out.clear();
+    out.reserve(size);
+    out.assign(std::istream_iterator<T>(infile), std::istream_iterator<T>());
+    infile.close();
+    return true;
+}
+
+template <typename T>
+bool writeASCIIFile(const std::string& fileName, const std::vector<T>& in)
+{
+    std::ofstream outfile(fileName);
+    if (!outfile.is_open())
+    {
+        std::cout << "ERROR: writeASCIIFile: Attempting to write to a file that is not open." << std::endl;
+        return false;
+    }
+    for (auto fn : in)
+    {
+        outfile << fn << "\n";
+    }
+    outfile.close();
+    return true;
+}
+
+inline void print_version()
+{
+    std::cout << "  TensorRT version: " << NV_TENSORRT_MAJOR << "." << NV_TENSORRT_MINOR << "." << NV_TENSORRT_PATCH
+              << "." << NV_TENSORRT_BUILD << std::endl;
+}
+
+inline std::string getFileType(const std::string& filepath)
+{
+    return filepath.substr(filepath.find_last_of(".") + 1);
+}
+
+inline std::string toLower(const std::string& inp)
+{
+    std::string out = inp;
+    std::transform(out.begin(), out.end(), out.begin(), ::tolower);
+    return out;
+}
+
+inline float getMaxValue(const float* buffer, int64_t size)
+{
+    assert(buffer != nullptr);
+    assert(size > 0);
+    return *std::max_element(buffer, buffer + size);
+}
+
+// Ensures that every tensor used by a network has a dynamic range set.
+//
+// All tensors in a network must have a dynamic range specified if a calibrator is not used.
+// This function is just a utility to globally fill in missing scales and zero-points for the entire network.
+//
+// If a tensor does not have a dynamic range set, it is assigned inRange or outRange as follows:
+//
+// * If the tensor is the input to a layer or output of a pooling node, its dynamic range is derived from inRange.
+// * Otherwise its dynamic range is derived from outRange.
+//
+// The default parameter values are intended to demonstrate, for final layers in the network,
+// cases where dynamic ranges are asymmetric.
+//
+// The default parameter values were chosen arbitrarily. Range values should be chosen such that
+// we avoid underflow or overflow. The range values should also be non-zero to avoid a uniform zero scale tensor.
+inline void setAllDynamicRanges(nvinfer1::INetworkDefinition* network, float inRange = 2.0f, float outRange = 4.0f)
+{
+    // Ensure that all layer inputs have a scale.
+    for (int i = 0; i < network->getNbLayers(); i++)
+    {
+        auto layer = network->getLayer(i);
+        for (int j = 0; j < layer->getNbInputs(); j++)
+        {
+            nvinfer1::ITensor* input{layer->getInput(j)};
+            // Optional inputs are nullptr here and are from RNN layers.
+            if (input != nullptr && !input->dynamicRangeIsSet())
+            {
+                ASSERT(input->setDynamicRange(-inRange, inRange));
+            }
+        }
+    }
+
+    // Ensure that all layer outputs have a scale.
+    // Tensors that are also inputs to layers are ignored here
+    // since the previous loop nest assigned scales to them.
+    for (int i = 0; i < network->getNbLayers(); i++)
+    {
+        auto layer = network->getLayer(i);
+        for (int j = 0; j < layer->getNbOutputs(); j++)
+        {
+            nvinfer1::ITensor* output{layer->getOutput(j)};
+            // Optional outputs are nullptr here and are from RNN layers.
+            if (output != nullptr && !output->dynamicRangeIsSet())
+            {
+                // Pooling must have the same input and output scales.
+                if (layer->getType() == nvinfer1::LayerType::kPOOLING)
+                {
+                    ASSERT(output->setDynamicRange(-inRange, inRange));
+                }
+                else
+                {
+                    ASSERT(output->setDynamicRange(-outRange, outRange));
+                }
+            }
+        }
+    }
+}
+
+inline void setDummyInt8DynamicRanges(const nvinfer1::IBuilderConfig* c, nvinfer1::INetworkDefinition* n)
+{
+    // Set dummy per-tensor dynamic range if Int8 mode is requested.
+    if (c->getFlag(nvinfer1::BuilderFlag::kINT8))
+    {
+        sample::gLogWarning
+            << "Int8 calibrator not provided. Generating dummy per-tensor dynamic range. Int8 accuracy is not guaranteed."
+            << std::endl;
+        setAllDynamicRanges(n);
+    }
+}
+
+inline void enableDLA(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, int useDLACore, bool allowGPUFallback = true)
+{
+    if (useDLACore >= 0)
+    {
+        if (builder->getNbDLACores() == 0)
+        {
+            std::cerr << "Trying to use DLA core " << useDLACore << " on a platform that doesn't have any DLA cores"
+                      << std::endl;
+            assert("Error: use DLA core on a platform that doesn't have any DLA cores" && false);
+        }
+        if (allowGPUFallback)
+        {
+            config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
+        }
+        if (!config->getFlag(nvinfer1::BuilderFlag::kINT8))
+        {
+            // User has not requested INT8 Mode.
+            // By default run in FP16 mode. FP32 mode is not permitted.
+            config->setFlag(nvinfer1::BuilderFlag::kFP16);
+        }
+        config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);
+        config->setDLACore(useDLACore);
+    }
+}
+
+inline int32_t parseDLA(int32_t argc, char** argv)
+{
+    for (int32_t i = 1; i < argc; i++)
+    {
+        if (strncmp(argv[i], "--useDLACore=", 13) == 0)
+        {
+            return std::stoi(argv[i] + 13);
+        }
+    }
+    return -1;
+}
+
+inline uint32_t getElementSize(nvinfer1::DataType t) noexcept
+{
+    switch (t)
+    {
+    case nvinfer1::DataType::kINT32: return 4;
+    case nvinfer1::DataType::kFLOAT: return 4;
+    case nvinfer1::DataType::kHALF: return 2;
+    case nvinfer1::DataType::kBOOL:
+    case nvinfer1::DataType::kINT8: return 1;
+    }
+    return 0;
+}
+
+inline int64_t volume(const nvinfer1::Dims& d)
+{
+    return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies<int64_t>());
+}
+
+template <int C, int H, int W>
+struct PPM
+{
+    std::string magic, fileName;
+    int h, w, max;
+    uint8_t buffer[C * H * W];
+};
+
+// New vPPM (variable-sized PPM) struct with variable dimensions, read via readPPMFile below.
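+// A usage sketch (illustrative only; the file name and search directory are
+// placeholder assumptions):
+//
+//     vPPM ppm;
+//     std::vector<std::string> dirs{"data/"};
+//     readPPMFile("image.ppm", ppm, dirs);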
+struct vPPM +{ + std::string magic, fileName; + int h, w, max; + std::vector buffer; +}; + +struct BBox +{ + float x1, y1, x2, y2; +}; + +template +void readPPMFile(const std::string& filename, samplesCommon::PPM& ppm) +{ + ppm.fileName = filename; + std::ifstream infile(filename, std::ifstream::binary); + assert(infile.is_open() && "Attempting to read from a file that is not open."); + infile >> ppm.magic >> ppm.w >> ppm.h >> ppm.max; + infile.seekg(1, infile.cur); + infile.read(reinterpret_cast(ppm.buffer), ppm.w * ppm.h * 3); +} + +inline void readPPMFile(const std::string& filename, vPPM& ppm, std::vector& input_dir) +{ + ppm.fileName = filename; + std::ifstream infile(locateFile(filename, input_dir), std::ifstream::binary); + infile >> ppm.magic >> ppm.w >> ppm.h >> ppm.max; + infile.seekg(1, infile.cur); + + for (int i = 0; i < ppm.w * ppm.h * 3; ++i) + { + ppm.buffer.push_back(0); + } + + infile.read(reinterpret_cast(&ppm.buffer[0]), ppm.w * ppm.h * 3); +} + +template +void writePPMFileWithBBox(const std::string& filename, PPM& ppm, const BBox& bbox) +{ + std::ofstream outfile("./" + filename, std::ofstream::binary); + assert(!outfile.fail()); + outfile << "P6" + << "\n" + << ppm.w << " " << ppm.h << "\n" + << ppm.max << "\n"; + + auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); }; + const int x1 = std::min(std::max(0, round(int(bbox.x1))), W - 1); + const int x2 = std::min(std::max(0, round(int(bbox.x2))), W - 1); + const int y1 = std::min(std::max(0, round(int(bbox.y1))), H - 1); + const int y2 = std::min(std::max(0, round(int(bbox.y2))), H - 1); + + for (int x = x1; x <= x2; ++x) + { + // bbox top border + ppm.buffer[(y1 * ppm.w + x) * 3] = 255; + ppm.buffer[(y1 * ppm.w + x) * 3 + 1] = 0; + ppm.buffer[(y1 * ppm.w + x) * 3 + 2] = 0; + // bbox bottom border + ppm.buffer[(y2 * ppm.w + x) * 3] = 255; + ppm.buffer[(y2 * ppm.w + x) * 3 + 1] = 0; + ppm.buffer[(y2 * ppm.w + x) * 3 + 2] = 0; + } + + for (int y = y1; y <= y2; ++y) + { + // bbox left border + ppm.buffer[(y * ppm.w + x1) * 3] = 255; + ppm.buffer[(y * ppm.w + x1) * 3 + 1] = 0; + ppm.buffer[(y * ppm.w + x1) * 3 + 2] = 0; + // bbox right border + ppm.buffer[(y * ppm.w + x2) * 3] = 255; + ppm.buffer[(y * ppm.w + x2) * 3 + 1] = 0; + ppm.buffer[(y * ppm.w + x2) * 3 + 2] = 0; + } + + outfile.write(reinterpret_cast(ppm.buffer), ppm.w * ppm.h * 3); +} + +inline void writePPMFileWithBBox(const std::string& filename, vPPM ppm, std::vector& dets) +{ + std::ofstream outfile("./" + filename, std::ofstream::binary); + assert(!outfile.fail()); + outfile << "P6" + << "\n" + << ppm.w << " " << ppm.h << "\n" + << ppm.max << "\n"; + auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); }; + + for (auto bbox : dets) + { + for (int x = int(bbox.x1); x < int(bbox.x2); ++x) + { + // bbox top border + ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3] = 255; + ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3 + 1] = 0; + ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3 + 2] = 0; + // bbox bottom border + ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3] = 255; + ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3 + 1] = 0; + ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3 + 2] = 0; + } + + for (int y = int(bbox.y1); y < int(bbox.y2); ++y) + { + // bbox left border + ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3] = 255; + ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3 + 1] = 0; + ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3 + 2] = 0; + // bbox right border + ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3] = 255; + ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3 + 
1] = 0; + ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3 + 2] = 0; + } + } + + outfile.write(reinterpret_cast(&ppm.buffer[0]), ppm.w * ppm.h * 3); +} + +class TimerBase +{ +public: + virtual void start() {} + virtual void stop() {} + float microseconds() const noexcept + { + return mMs * 1000.f; + } + float milliseconds() const noexcept + { + return mMs; + } + float seconds() const noexcept + { + return mMs / 1000.f; + } + void reset() noexcept + { + mMs = 0.f; + } + +protected: + float mMs{0.0f}; +}; + +class GpuTimer : public TimerBase +{ +public: + explicit GpuTimer(cudaStream_t stream) + : mStream(stream) + { + CHECK(cudaEventCreate(&mStart)); + CHECK(cudaEventCreate(&mStop)); + } + ~GpuTimer() + { + CHECK(cudaEventDestroy(mStart)); + CHECK(cudaEventDestroy(mStop)); + } + void start() + { + CHECK(cudaEventRecord(mStart, mStream)); + } + void stop() + { + CHECK(cudaEventRecord(mStop, mStream)); + float ms{0.0f}; + CHECK(cudaEventSynchronize(mStop)); + CHECK(cudaEventElapsedTime(&ms, mStart, mStop)); + mMs += ms; + } + +private: + cudaEvent_t mStart, mStop; + cudaStream_t mStream; +}; // class GpuTimer + +template +class CpuTimer : public TimerBase +{ +public: + using clock_type = Clock; + + void start() + { + mStart = Clock::now(); + } + void stop() + { + mStop = Clock::now(); + mMs += std::chrono::duration{mStop - mStart}.count(); + } + +private: + std::chrono::time_point mStart, mStop; +}; // class CpuTimer + +using PreciseCpuTimer = CpuTimer; + +inline std::vector splitString(std::string str, char delimiter = ',') +{ + std::vector splitVect; + std::stringstream ss(str); + std::string substr; + + while (ss.good()) + { + getline(ss, substr, delimiter); + splitVect.emplace_back(std::move(substr)); + } + return splitVect; +} + +// Return m rounded up to nearest multiple of n +inline int roundUp(int m, int n) +{ + return ((m + n - 1) / n) * n; +} + +inline int getC(const nvinfer1::Dims& d) +{ + return d.nbDims >= 3 ? d.d[d.nbDims - 3] : 1; +} + +inline int getH(const nvinfer1::Dims& d) +{ + return d.nbDims >= 2 ? d.d[d.nbDims - 2] : 1; +} + +inline int getW(const nvinfer1::Dims& d) +{ + return d.nbDims >= 1 ? d.d[d.nbDims - 1] : 1; +} + +inline void loadLibrary(const std::string& path) +{ +#ifdef _MSC_VER + void* handle = LoadLibrary(path.c_str()); +#else + int32_t flags{RTLD_LAZY}; +#if ENABLE_ASAN + // https://github.com/google/sanitizers/issues/89 + // asan doesn't handle module unloading correctly and there are no plans on doing + // so. In order to get proper stack traces, don't delete the shared library on + // close so that asan can resolve the symbols correctly. 
+ flags |= RTLD_NODELETE; +#endif // ENABLE_ASAN + + void* handle = dlopen(path.c_str(), flags); +#endif + if (handle == nullptr) + { +#ifdef _MSC_VER + sample::gLogError << "Could not load plugin library: " << path << std::endl; +#else + sample::gLogError << "Could not load plugin library: " << path << ", due to: " << dlerror() << std::endl; +#endif + } +} + +inline int32_t getSMVersion() +{ + int32_t deviceIndex = 0; + CHECK(cudaGetDevice(&deviceIndex)); + + int32_t major, minor; + CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, deviceIndex)); + CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, deviceIndex)); + + return ((major << 8) | minor); +} + +inline bool isSMSafe() +{ + const int32_t smVersion = getSMVersion(); + return smVersion == 0x0700 || smVersion == 0x0702 || smVersion == 0x0705 || + smVersion == 0x0800 || smVersion == 0x0806 || smVersion == 0x0807; +} + +inline bool isDataTypeSupported(nvinfer1::DataType dataType) +{ + auto builder = SampleUniquePtr(nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger())); + if (!builder) + { + return false; + } + + if ((dataType == nvinfer1::DataType::kINT8 && !builder->platformHasFastInt8()) + || (dataType == nvinfer1::DataType::kHALF && !builder->platformHasFastFp16())) + { + return false; + } + + return true; +} + +} // namespace samplesCommon + +inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) +{ + os << "("; + for (int i = 0; i < dims.nbDims; ++i) + { + os << (i ? ", " : "") << dims.d[i]; + } + return os << ")"; +} + +#endif // TENSORRT_COMMON_H diff --git a/src/Detector/tensorrt_yolo/common_deprecated/half.h b/src/Detector/tensorrt_yolo/common_deprecated/half.h new file mode 100644 index 00000000..0755c316 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common_deprecated/half.h @@ -0,0 +1,4302 @@ +// half - IEEE 754-based half-precision floating point library. +// +// Copyright (c) 2012-2017 Christian Rau +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +// documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +// Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Version 1.12.0 + +/// \file +/// Main header file for half precision functionality. + +#ifndef HALF_HALF_HPP +#define HALF_HALF_HPP + +/// Combined gcc version number. +#define HALF_GNUC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +// check C++11 language features +#if defined(__clang__) // clang +#if __has_feature(cxx_static_assert) && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if __has_feature(cxx_constexpr) && !defined(HALF_ENABLE_CPP11_CONSTEXPR) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if __has_feature(cxx_noexcept) && !defined(HALF_ENABLE_CPP11_NOEXCEPT) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if __has_feature(cxx_user_literals) && !defined(HALF_ENABLE_CPP11_USER_LITERALS) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L) && !defined(HALF_ENABLE_CPP11_LONG_LONG) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +/*#elif defined(__INTEL_COMPILER) //Intel C++ + #if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) ???????? + #define HALF_ENABLE_CPP11_STATIC_ASSERT 1 + #endif + #if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) ???????? + #define HALF_ENABLE_CPP11_CONSTEXPR 1 + #endif + #if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) ???????? + #define HALF_ENABLE_CPP11_NOEXCEPT 1 + #endif + #if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_LONG_LONG) ???????? 
+ #define HALF_ENABLE_CPP11_LONG_LONG 1 + #endif*/ +#elif defined(__GNUC__) // gcc +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L +#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if HALF_GNUC_VERSION >= 407 && !defined(HALF_ENABLE_CPP11_USER_LITERALS) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if !defined(HALF_ENABLE_CPP11_LONG_LONG) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +#endif +#elif defined(_MSC_VER) // Visual C++ +#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_USER_LITERALS) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if _MSC_VER >= 1600 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if _MSC_VER >= 1310 && !defined(HALF_ENABLE_CPP11_LONG_LONG) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +#define HALF_POP_WARNINGS 1 +#pragma warning(push) +#pragma warning(disable : 4099 4127 4146) // struct vs class, constant in if, negative unsigned +#endif + +// check C++11 library features +#include +#if defined(_LIBCPP_VERSION) // libc++ +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103 +#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#ifndef HALF_ENABLE_CPP11_CSTDINT +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#ifndef HALF_ENABLE_CPP11_CMATH +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#ifndef HALF_ENABLE_CPP11_HASH +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#endif +#elif defined(__GLIBCXX__) // libstdc++ +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103 +#ifdef __clang__ +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS) +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CSTDINT) +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CMATH) +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_HASH) +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#else +#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CSTDINT) +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CMATH) +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_HASH) +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#endif +#endif +#elif defined(_CPPLIB_VER) // Dinkumware/Visual C++ +#if _CPPLIB_VER >= 520 +#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#ifndef HALF_ENABLE_CPP11_CSTDINT +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#ifndef HALF_ENABLE_CPP11_HASH +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#endif +#if _CPPLIB_VER >= 610 +#ifndef HALF_ENABLE_CPP11_CMATH +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#endif +#endif +#undef HALF_GNUC_VERSION + +// support constexpr +#if HALF_ENABLE_CPP11_CONSTEXPR +#define HALF_CONSTEXPR constexpr +#define HALF_CONSTEXPR_CONST constexpr +#else +#define HALF_CONSTEXPR +#define 
HALF_CONSTEXPR_CONST const +#endif + +// support noexcept +#if HALF_ENABLE_CPP11_NOEXCEPT +#define HALF_NOEXCEPT noexcept +#define HALF_NOTHROW noexcept +#else +#define HALF_NOEXCEPT +#define HALF_NOTHROW throw() +#endif + +#include +#include +#include +#include +#include +#include +#if HALF_ENABLE_CPP11_TYPE_TRAITS +#include +#endif +#if HALF_ENABLE_CPP11_CSTDINT +#include +#endif +#if HALF_ENABLE_CPP11_HASH +#include +#endif + +/// Default rounding mode. +/// This specifies the rounding mode used for all conversions between [half](\ref half_float::half)s and `float`s as +/// well as for the half_cast() if not specifying a rounding mode explicitly. It can be redefined (before including +/// half.hpp) to one of the standard rounding modes using their respective constants or the equivalent values of +/// `std::float_round_style`: +/// +/// `std::float_round_style` | value | rounding +/// ---------------------------------|-------|------------------------- +/// `std::round_indeterminate` | -1 | fastest (default) +/// `std::round_toward_zero` | 0 | toward zero +/// `std::round_to_nearest` | 1 | to nearest +/// `std::round_toward_infinity` | 2 | toward positive infinity +/// `std::round_toward_neg_infinity` | 3 | toward negative infinity +/// +/// By default this is set to `-1` (`std::round_indeterminate`), which uses truncation (round toward zero, but with +/// overflows set to infinity) and is the fastest rounding mode possible. It can even be set to +/// `std::numeric_limits::round_style` to synchronize the rounding mode with that of the underlying +/// single-precision implementation. +#ifndef HALF_ROUND_STYLE +#define HALF_ROUND_STYLE 1 // = std::round_to_nearest +#endif + +/// Tie-breaking behaviour for round to nearest. +/// This specifies if ties in round to nearest should be resolved by rounding to the nearest even value. By default this +/// is defined to `0` resulting in the faster but slightly more biased behaviour of rounding away from zero in half-way +/// cases (and thus equal to the round() function), but can be redefined to `1` (before including half.hpp) if more +/// IEEE-conformant behaviour is needed. +#ifndef HALF_ROUND_TIES_TO_EVEN +#define HALF_ROUND_TIES_TO_EVEN 0 // ties away from zero +#endif + +/// Value signaling overflow. +/// In correspondence with `HUGE_VAL[F|L]` from `` this symbol expands to a positive value signaling the overflow +/// of an operation, in particular it just evaluates to positive infinity. +#define HUGE_VALH std::numeric_limits::infinity() + +/// Fast half-precision fma function. +/// This symbol is only defined if the fma() function generally executes as fast as, or faster than, a separate +/// half-precision multiplication followed by an addition. Due to the internal single-precision implementation of all +/// arithmetic operations, this is in fact always the case. +#define FP_FAST_FMAH 1 + +#ifndef FP_ILOGB0 +#define FP_ILOGB0 INT_MIN +#endif +#ifndef FP_ILOGBNAN +#define FP_ILOGBNAN INT_MAX +#endif +#ifndef FP_SUBNORMAL +#define FP_SUBNORMAL 0 +#endif +#ifndef FP_ZERO +#define FP_ZERO 1 +#endif +#ifndef FP_NAN +#define FP_NAN 2 +#endif +#ifndef FP_INFINITE +#define FP_INFINITE 3 +#endif +#ifndef FP_NORMAL +#define FP_NORMAL 4 +#endif + +/// Main namespace for half precision functionality. +/// This namespace contains all the functionality provided by the library. +namespace half_float +{ +class half; + +#if HALF_ENABLE_CPP11_USER_LITERALS +/// Library-defined half-precision literals. 
+
+/// Value signaling overflow.
+/// In correspondence with `HUGE_VAL[F|L]` from `<cmath>` this symbol expands to a positive value signaling the overflow
+/// of an operation, in particular it just evaluates to positive infinity.
+#define HUGE_VALH std::numeric_limits<half_float::half>::infinity()
+
+/// Fast half-precision fma function.
+/// This symbol is only defined if the fma() function generally executes as fast as, or faster than, a separate
+/// half-precision multiplication followed by an addition. Due to the internal single-precision implementation of all
+/// arithmetic operations, this is in fact always the case.
+#define FP_FAST_FMAH 1
+
+#ifndef FP_ILOGB0
+#define FP_ILOGB0 INT_MIN
+#endif
+#ifndef FP_ILOGBNAN
+#define FP_ILOGBNAN INT_MAX
+#endif
+#ifndef FP_SUBNORMAL
+#define FP_SUBNORMAL 0
+#endif
+#ifndef FP_ZERO
+#define FP_ZERO 1
+#endif
+#ifndef FP_NAN
+#define FP_NAN 2
+#endif
+#ifndef FP_INFINITE
+#define FP_INFINITE 3
+#endif
+#ifndef FP_NORMAL
+#define FP_NORMAL 4
+#endif
+
+/// Main namespace for half precision functionality.
+/// This namespace contains all the functionality provided by the library.
+namespace half_float
+{
+class half;
+
+#if HALF_ENABLE_CPP11_USER_LITERALS
+/// Library-defined half-precision literals.
+/// Import this namespace to enable half-precision floating point literals:
+/// ~~~~{.cpp}
+/// using namespace half_float::literal;
+/// half_float::half x = 4.2_h;
+/// ~~~~
+namespace literal
+{
+half operator"" _h(long double);
+}
+#endif
+
+/// \internal
+/// \brief Implementation details.
+namespace detail
+{
+#if HALF_ENABLE_CPP11_TYPE_TRAITS
+/// Conditional type.
+template <bool B, typename T, typename F>
+struct conditional : std::conditional<B, T, F>
+{
+};
+
+/// Helper for tag dispatching.
+template <bool B>
+struct bool_type : std::integral_constant<bool, B>
+{
+};
+using std::false_type;
+using std::true_type;
+
+/// Type traits for floating point types.
+template <typename T>
+struct is_float : std::is_floating_point<T>
+{
+};
+#else
+/// Conditional type.
+template <bool, typename T, typename>
+struct conditional
+{
+    typedef T type;
+};
+template <typename T, typename F>
+struct conditional<false, T, F>
+{
+    typedef F type;
+};
+
+/// Helper for tag dispatching.
+template <bool>
+struct bool_type
+{
+};
+typedef bool_type<true> true_type;
+typedef bool_type<false> false_type;
+
+/// Type traits for floating point types.
+template <typename>
+struct is_float : false_type
+{
+};
+template <typename T>
+struct is_float<const T> : is_float<T>
+{
+};
+template <typename T>
+struct is_float<volatile T> : is_float<T>
+{
+};
+template <typename T>
+struct is_float<const volatile T> : is_float<T>
+{
+};
+template <>
+struct is_float<float> : true_type
+{
+};
+template <>
+struct is_float<double> : true_type
+{
+};
+template <>
+struct is_float<long double> : true_type
+{
+};
+#endif
+
+/// Type traits for floating point bits.
+template <typename T>
+struct bits
+{
+    typedef unsigned char type;
+};
+template <typename T>
+struct bits<const T> : bits<T>
+{
+};
+template <typename T>
+struct bits<volatile T> : bits<T>
+{
+};
+template <typename T>
+struct bits<const volatile T> : bits<T>
+{
+};
+
+#if HALF_ENABLE_CPP11_CSTDINT
+/// Unsigned integer of (at least) 16 bits width.
+typedef std::uint_least16_t uint16;
+
+/// Unsigned integer of (at least) 32 bits width.
+template <>
+struct bits<float>
+{
+    typedef std::uint_least32_t type;
+};
+
+/// Unsigned integer of (at least) 64 bits width.
+template <>
+struct bits<double>
+{
+    typedef std::uint_least64_t type;
+};
+#else
+/// Unsigned integer of (at least) 16 bits width.
+typedef unsigned short uint16;
+
+/// Unsigned integer of (at least) 32 bits width.
+template <>
+struct bits<float> : conditional<std::numeric_limits<unsigned int>::digits >= 32, unsigned int, unsigned long>
+{
+};
+
+#if HALF_ENABLE_CPP11_LONG_LONG
+/// Unsigned integer of (at least) 64 bits width.
+template <>
+struct bits<double> : conditional<std::numeric_limits<unsigned long>::digits >= 64, unsigned long, unsigned long long>
+{
+};
+#else
+/// Unsigned integer of (at least) 64 bits width.
+template <>
+struct bits<double>
+{
+    typedef unsigned long type;
+};
+#endif
+#endif
+
+/// Tag type for binary construction.
+struct binary_t
+{
+};
+
+/// Tag for binary construction.
+HALF_CONSTEXPR_CONST binary_t binary = binary_t();
+
+/// Temporary half-precision expression.
+/// This class represents a half-precision expression which just stores a single-precision value internally.
+struct expr
+{
+    /// Conversion constructor.
+    /// \param f single-precision value to convert
+    explicit HALF_CONSTEXPR expr(float f) HALF_NOEXCEPT : value_(f) {}
+
+    /// Conversion to single-precision.
+    /// \return single precision value representing expression value
+    HALF_CONSTEXPR operator float() const HALF_NOEXCEPT
+    {
+        return value_;
+    }
+
+private:
+    /// Internal expression value stored in single-precision.
+    float value_;
+};
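+
+// Illustrative sketch (editorial, not upstream code): bits<T> names an unsigned type wide
+// enough to alias T's object representation, and bool_type<> drives the tag dispatch
+// between the bit-level and generic conversion paths further below:
+//
+// ~~~~{.cpp}
+// typedef half_float::detail::bits<float>::type u32; // std::uint_least32_t with <cstdint>
+// // float2half() dispatches its implementation on
+// // bool_type<std::numeric_limits<float>::is_iec559 && sizeof(u32) == sizeof(float)>()
+// ~~~~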
+
+/// SFINAE helper for generic half-precision functions.
+/// This class template has to be specialized for each valid combination of argument types to provide a corresponding
+/// `type` member equivalent to \a T.
+/// \tparam T type to return
+template <typename T, typename, typename = void, typename = void>
+struct enable
+{
+};
+template <typename T>
+struct enable<T, half, void, void>
+{
+    typedef T type;
+};
+template <typename T>
+struct enable<T, expr, void, void>
+{
+    typedef T type;
+};
+template <typename T>
+struct enable<T, half, half, void>
+{
+    typedef T type;
+};
+template <typename T>
+struct enable<T, half, expr, void>
+{
+    typedef T type;
+};
+template <typename T>
+struct enable<T, expr, half, void>
+{
+    typedef T type;
+};
+template <typename T>
+struct enable<T, expr, expr, void>
+{
+    typedef T type;
+};
+template <typename T>
+struct enable<T, half, half, half>
+{
+    typedef T type;
+};
+template <typename T>
+struct enable<T, half, half, expr>
+{
+    typedef T type;
+};
+template <typename T>
+struct enable<T, half, expr, half>
+{
+    typedef T type;
+};
+template <typename T>
+struct enable<T, half, expr, expr>
+{
+    typedef T type;
+};
+template <typename T>
+struct enable<T, expr, half, half>
+{
+    typedef T type;
+};
+template <typename T>
+struct enable<T, expr, half, expr>
+{
+    typedef T type;
+};
+template <typename T>
+struct enable<T, expr, expr, half>
+{
+    typedef T type;
+};
+template <typename T>
+struct enable<T, expr, expr, expr>
+{
+    typedef T type;
+};
+
+/// Return type for specialized generic 2-argument half-precision functions.
+/// This class template has to be specialized for each valid combination of argument types to provide a corresponding
+/// `type` member denoting the appropriate return type.
+/// \tparam T first argument type
+/// \tparam U second argument type
+template <typename T, typename U>
+struct result : enable<expr, T, U>
+{
+};
+template <>
+struct result<half, half>
+{
+    typedef half type;
+};
+
+/// \name Classification helpers
+/// \{
+
+/// Check for infinity.
+/// \tparam T argument type (builtin floating point type)
+/// \param arg value to query
+/// \retval true if infinity
+/// \retval false else
+template <typename T>
+bool builtin_isinf(T arg)
+{
+#if HALF_ENABLE_CPP11_CMATH
+    return std::isinf(arg);
+#elif defined(_MSC_VER)
+    return !::_finite(static_cast<double>(arg)) && !::_isnan(static_cast<double>(arg));
+#else
+    return arg == std::numeric_limits<T>::infinity() || arg == -std::numeric_limits<T>::infinity();
+#endif
+}
+
+/// Check for NaN.
+/// \tparam T argument type (builtin floating point type)
+/// \param arg value to query
+/// \retval true if not a number
+/// \retval false else
+template <typename T>
+bool builtin_isnan(T arg)
+{
+#if HALF_ENABLE_CPP11_CMATH
+    return std::isnan(arg);
+#elif defined(_MSC_VER)
+    return ::_isnan(static_cast<double>(arg)) != 0;
+#else
+    return arg != arg;
+#endif
+}
+
+/// Check sign.
+/// \tparam T argument type (builtin floating point type)
+/// \param arg value to query
+/// \retval true if signbit set
+/// \retval false else
+template <typename T>
+bool builtin_signbit(T arg)
+{
+#if HALF_ENABLE_CPP11_CMATH
+    return std::signbit(arg);
+#else
+    return arg < T() || (arg == T() && T(1) / arg < T());
+#endif
+}
+
+/// \}
+/// \name Conversion
+/// \{
+
+/// Convert IEEE single-precision to half-precision.
+/// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf).
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \param value single-precision value
+/// \return binary representation of half-precision value
+template <std::float_round_style R>
+uint16 float2half_impl(float value, true_type)
+{
+    typedef bits<float>::type uint32;
+    uint32 bits; // = *reinterpret_cast<uint32*>(&value); //violating strict aliasing!
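+    // Illustrative note (editorial, not upstream code): memcpy is the strict-aliasing-safe
+    // way to read the float's object representation; the commented-out reinterpret_cast
+    // above is not. On a C++20 toolchain the same read could be spelled as
+    //     uint32 u = std::bit_cast<uint32>(value); // <bit>, valid when the sizes match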
+ std::memcpy(&bits, &value, sizeof(float)); + /* uint16 hbits = (bits>>16) & 0x8000; + bits &= 0x7FFFFFFF; + int exp = bits >> 23; + if(exp == 255) + return hbits | 0x7C00 | (0x3FF&-static_cast((bits&0x7FFFFF)!=0)); + if(exp > 142) + { + if(R == std::round_toward_infinity) + return hbits | 0x7C00 - (hbits>>15); + if(R == std::round_toward_neg_infinity) + return hbits | 0x7BFF + (hbits>>15); + return hbits | 0x7BFF + (R!=std::round_toward_zero); + } + int g, s; + if(exp > 112) + { + g = (bits>>12) & 1; + s = (bits&0xFFF) != 0; + hbits |= ((exp-112)<<10) | ((bits>>13)&0x3FF); + } + else if(exp > 101) + { + int i = 125 - exp; + bits = (bits&0x7FFFFF) | 0x800000; + g = (bits>>i) & 1; + s = (bits&((1L<> (i+1); + } + else + { + g = 0; + s = bits != 0; + } + if(R == std::round_to_nearest) + #if HALF_ROUND_TIES_TO_EVEN + hbits += g & (s|hbits); + #else + hbits += g; + #endif + else if(R == std::round_toward_infinity) + hbits += ~(hbits>>15) & (s|g); + else if(R == std::round_toward_neg_infinity) + hbits += (hbits>>15) & (g|s); + */ + static const uint16 base_table[512] = {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, + 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, + 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, 0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, + 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, + 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, 0xC000, 0xC400, 0xC800, + 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00}; + static const unsigned char shift_table[512] = {24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13};
+    uint16 hbits = base_table[bits >> 23] + static_cast<uint16>((bits & 0x7FFFFF) >> shift_table[bits >> 23]);
+    if (R == std::round_to_nearest)
+        hbits += (((bits & 0x7FFFFF) >> (shift_table[bits >> 23] - 1)) | (((bits >> 23) & 0xFF) == 102))
+            & ((hbits & 0x7C00) != 0x7C00)
+#if HALF_ROUND_TIES_TO_EVEN
+            & (((((static_cast<uint32>(1) << (shift_table[bits >> 23] - 1)) - 1) & bits) != 0) | hbits)
+#endif
+            ;
+    else if (R == std::round_toward_zero)
+        hbits -= ((hbits & 0x7FFF) == 0x7C00) & ~shift_table[bits >> 23];
+    else if (R == std::round_toward_infinity)
+        hbits += ((((bits & 0x7FFFFF & ((static_cast<uint32>(1) << (shift_table[bits >> 23])) - 1)) != 0)
+                      | (((bits >> 23) <= 102) & ((bits >> 23) != 0)))
+                     & (hbits < 0x7C00))
+            - ((hbits == 0xFC00) & ((bits >> 23) != 511));
+    else if (R == std::round_toward_neg_infinity)
+        hbits += ((((bits & 0x7FFFFF & ((static_cast<uint32>(1) << (shift_table[bits >> 23])) - 1)) != 0)
+                      | (((bits >> 23) <= 358) & ((bits >> 23) != 256)))
+                     & (hbits < 0xFC00) & (hbits >> 15))
+            - ((hbits == 0x7C00) & ((bits >> 23) != 255));
+    return hbits;
+}
+
+/// Convert IEEE double-precision to half-precision.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \param value double-precision value
+/// \return binary representation of half-precision value
+template <std::float_round_style R>
+uint16 float2half_impl(double value, true_type)
+{
+    typedef bits<float>::type uint32;
+    typedef bits<double>::type uint64;
+    uint64 bits; // = *reinterpret_cast<uint64*>(&value); //violating strict aliasing!
+    std::memcpy(&bits, &value, sizeof(double));
+    uint32 hi = bits >> 32, lo = bits & 0xFFFFFFFF;
+    uint16 hbits = (hi >> 16) & 0x8000;
+    hi &= 0x7FFFFFFF;
+    int exp = hi >> 20;
+    if (exp == 2047)
+        return hbits | 0x7C00 | (0x3FF & -static_cast<unsigned>((bits & 0xFFFFFFFFFFFFF) != 0));
+    if (exp > 1038)
+    {
+        if (R == std::round_toward_infinity)
+            return hbits | (0x7C00 - (hbits >> 15));
+        if (R == std::round_toward_neg_infinity)
+            return hbits | (0x7BFF + (hbits >> 15));
+        return hbits | (0x7BFF + (R != std::round_toward_zero));
+    }
+    int g, s = lo != 0;
+    if (exp > 1008)
+    {
+        g = (hi >> 9) & 1;
+        s |= (hi & 0x1FF) != 0;
+        hbits |= ((exp - 1008) << 10) | ((hi >> 10) & 0x3FF);
+    }
+    else if (exp > 997)
+    {
+        int i = 1018 - exp;
+        hi = (hi & 0xFFFFF) | 0x100000;
+        g = (hi >> i) & 1;
+        s |= (hi & ((1L << i) - 1)) != 0;
+        hbits |= hi >> (i + 1);
+    }
+    else
+    {
+        g = 0;
+        s |= hi != 0;
+    }
+    if (R == std::round_to_nearest)
+#if HALF_ROUND_TIES_TO_EVEN
+        hbits += g & (s | hbits);
+#else
+        hbits += g;
+#endif
+    else if (R == std::round_toward_infinity)
+        hbits += ~(hbits >> 15) & (s | g);
+    else if (R == std::round_toward_neg_infinity)
+        hbits += (hbits >> 15) & (g | s);
+    return hbits;
+}
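+
+// Worked example (editorial, not upstream code): 1.0f has bit pattern 0x3F800000, i.e.
+// table index 0x7F (sign 0, exponent 127) with a zero mantissa. base_table[0x7F] = 0x3C00
+// and shift_table[0x7F] = 13, so the single-precision conversion above yields
+// 0x3C00 + (0 >> 13) = 0x3C00, exactly 1.0 in binary16, with no rounding correction:
+//
+// ~~~~{.cpp}
+// uint16 one = float2half_impl<std::round_to_nearest>(1.0f, true_type()); // 0x3C00
+// ~~~~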
+
+/// Convert non-IEEE floating point to half-precision.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam T source type (builtin floating point type)
+/// \param value floating point value
+/// \return binary representation of half-precision value
+template <std::float_round_style R, typename T>
+uint16 float2half_impl(T value, ...)
+{
+    uint16 hbits = static_cast<uint16>(builtin_signbit(value)) << 15;
+    if (value == T())
+        return hbits;
+    if (builtin_isnan(value))
+        return hbits | 0x7FFF;
+    if (builtin_isinf(value))
+        return hbits | 0x7C00;
+    int exp;
+    std::frexp(value, &exp);
+    if (exp > 16)
+    {
+        if (R == std::round_toward_infinity)
+            return hbits | (0x7C00 - (hbits >> 15));
+        else if (R == std::round_toward_neg_infinity)
+            return hbits | (0x7BFF + (hbits >> 15));
+        return hbits | (0x7BFF + (R != std::round_toward_zero));
+    }
+    if (exp < -13)
+        value = std::ldexp(value, 24);
+    else
+    {
+        value = std::ldexp(value, 11 - exp);
+        hbits |= ((exp + 13) << 10);
+    }
+    T ival, frac = std::modf(value, &ival);
+    hbits += static_cast<uint16>(std::abs(static_cast<int>(ival)));
+    if (R == std::round_to_nearest)
+    {
+        frac = std::abs(frac);
+#if HALF_ROUND_TIES_TO_EVEN
+        hbits += (frac > T(0.5)) | ((frac == T(0.5)) & hbits);
+#else
+        hbits += frac >= T(0.5);
+#endif
+    }
+    else if (R == std::round_toward_infinity)
+        hbits += frac > T();
+    else if (R == std::round_toward_neg_infinity)
+        hbits += frac < T();
+    return hbits;
+}
+
+/// Convert floating point to half-precision.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam T source type (builtin floating point type)
+/// \param value floating point value
+/// \return binary representation of half-precision value
+template <std::float_round_style R, typename T>
+uint16 float2half(T value)
+{
+    return float2half_impl<R>(
+        value, bool_type<std::numeric_limits<T>::is_iec559 && sizeof(typename bits<T>::type) == sizeof(T)>());
+}
+
+/// Convert integer to half-precision floating point.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam S `true` if value negative, `false` else
+/// \tparam T type to convert (builtin integer type)
+/// \param value non-negative integral value
+/// \return binary representation of half-precision value
+template <std::float_round_style R, bool S, typename T>
+uint16 int2half_impl(T value)
+{
+#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+    static_assert(std::is_integral<T>::value, "int to half conversion only supports builtin integer types");
+#endif
+    if (S)
+        value = -value;
+    uint16 bits = S << 15;
+    if (value > 0xFFFF)
+    {
+        if (R == std::round_toward_infinity)
+            bits |= 0x7C00 - S;
+        else if (R == std::round_toward_neg_infinity)
+            bits |= 0x7BFF + S;
+        else
+            bits |= 0x7BFF + (R != std::round_toward_zero);
+    }
+    else if (value)
+    {
+        uint32_t m = value, exp = 24;
+        for (; m < 0x400; m <<= 1, --exp)
+            ;
+        for (; m > 0x7FF; m >>= 1, ++exp)
+            ;
+        bits |= (exp << 10) + m;
+        if (exp > 24)
+        {
+            if (R == std::round_to_nearest)
+                bits += (value >> (exp - 25)) & 1
+#if HALF_ROUND_TIES_TO_EVEN
+                    & (((((1 << (exp - 25)) - 1) & value) != 0) | bits)
+#endif
+                    ;
+            else if (R == std::round_toward_infinity)
+                bits += ((value & ((1 << (exp - 24)) - 1)) != 0) & !S;
+            else if (R == std::round_toward_neg_infinity)
+                bits += ((value & ((1 << (exp - 24)) - 1)) != 0) & S;
+        }
+    }
+    return bits;
+}
+
+/// Convert integer to half-precision floating point.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam T type to convert (builtin integer type)
+/// \param value integral value
+/// \return binary representation of half-precision value
+template <std::float_round_style R, typename T>
+uint16 int2half(T value)
+{
+    return (value < 0) ? int2half_impl<R, true>(value) : int2half_impl<R, false>(value);
+}
+
+/// Convert half-precision to IEEE single-precision.
+/// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf).
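+/// [Editorial example, not upstream text] For the half 0x3C00 (= 1.0) the lookup below
+/// decomposes as:
+/// ~~~~{.cpp}
+/// // offset_table[0x3C00 >> 10] == 1024, mantissa_table[1024 + 0] == 0x38000000,
+/// // exponent_table[15] == 0x07800000, sum == 0x3F800000 == the bits of 1.0f
+/// ~~~~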
+/// \param value binary representation of half-precision value +/// \return single-precision value +inline float half2float_impl(uint16 value, float, true_type) +{ + typedef bits::type uint32; + /* uint32 bits = static_cast(value&0x8000) << 16; + int abs = value & 0x7FFF; + if(abs) + { + bits |= 0x38000000 << static_cast(abs>=0x7C00); + for(; abs<0x400; abs<<=1,bits-=0x800000) ; + bits += static_cast(abs) << 13; + } + */ + static const uint32 mantissa_table[2048] = {0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, + 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, + 0x35700000, 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, + 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, 0x36000000, + 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, + 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, 0x36400000, 0x36440000, 0x36480000, + 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, + 0x36700000, 0x36740000, 0x36780000, 0x367C0000, 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, + 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, + 0x369C0000, 0x369E0000, 0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, + 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, + 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, + 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, 0x36E00000, 0x36E20000, + 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, + 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000, + 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, + 0x370D0000, 0x370E0000, 0x370F0000, 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, + 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, + 0x371F0000, 0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, + 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, 0x37300000, + 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, + 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, 0x37400000, 0x37410000, 0x37420000, + 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, + 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, 0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, + 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, + 0x375E0000, 0x375F0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, + 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, + 0x37790000, 0x377A0000, 0x377B0000, 
0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, 0x37800000, 0x37808000, + 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, + 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000, + 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, + 0x378E8000, 0x378F0000, 0x378F8000, 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, + 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, + 0x37978000, 0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, + 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, 0x37A00000, + 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, + 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, 0x37A80000, 0x37A88000, 0x37A90000, + 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, + 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, + 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, + 0x37B70000, 0x37B78000, 0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, + 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, + 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, + 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, 0x37C80000, 0x37C88000, + 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, + 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, + 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, + 0x37D68000, 0x37D70000, 0x37D78000, 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, + 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, + 0x37DF8000, 0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, + 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, 0x37E80000, + 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, + 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, 0x37F00000, 0x37F08000, 0x37F10000, + 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, + 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, 0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, + 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, + 0x37FF0000, 0x37FF8000, 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, + 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, + 0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, + 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 
0x38078000, 0x3807C000, 0x38080000, 0x38084000, + 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, + 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, + 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, + 0x380F4000, 0x380F8000, 0x380FC000, 0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, + 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, + 0x3813C000, 0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, + 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, 0x38180000, + 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, + 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, 0x381C0000, 0x381C4000, 0x381C8000, + 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, + 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, 0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, + 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, + 0x38238000, 0x3823C000, 0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, + 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, + 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, + 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, 0x382C0000, 0x382C4000, + 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, + 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, 0x38300000, 0x38304000, 0x38308000, 0x3830C000, + 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, + 0x38334000, 0x38338000, 0x3833C000, 0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, + 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, + 0x3837C000, 0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, + 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, 0x383C0000, + 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, + 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, 0x38400000, 0x38404000, 0x38408000, + 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, + 0x38430000, 0x38434000, 0x38438000, 0x3843C000, 0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, + 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, + 0x38478000, 0x3847C000, 0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, + 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, + 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, + 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, 
0x38500000, 0x38504000, + 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, + 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, 0x38540000, 0x38544000, 0x38548000, 0x3854C000, + 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, + 0x38574000, 0x38578000, 0x3857C000, 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, + 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, + 0x385BC000, 0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, + 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, 0x38600000, + 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, + 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, 0x38640000, 0x38644000, 0x38648000, + 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, + 0x38670000, 0x38674000, 0x38678000, 0x3867C000, 0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, + 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, + 0x386B8000, 0x386BC000, 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, + 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, + 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, + 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, 0x38740000, 0x38744000, + 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, + 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, 0x38780000, 0x38784000, 0x38788000, 0x3878C000, + 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, + 0x387B4000, 0x387B8000, 0x387BC000, 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, + 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, + 0x387FC000, 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, + 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, 0x38020000, + 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, + 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, 0x38040000, 0x38042000, 0x38044000, + 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, + 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, 0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, + 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, + 0x3807C000, 0x3807E000, 0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, + 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, + 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, + 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, 0x380C0000, 0x380C2000, + 
0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, + 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, + 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, + 0x380FA000, 0x380FC000, 0x380FE000, 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, + 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, + 0x3811E000, 0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, + 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, 0x38140000, + 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, + 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, 0x38160000, 0x38162000, 0x38164000, + 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, + 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, 0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, + 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, + 0x3819C000, 0x3819E000, 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, + 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, + 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, + 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, 0x381E0000, 0x381E2000, + 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, + 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, 0x38200000, 0x38202000, 0x38204000, 0x38206000, + 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, + 0x3821A000, 0x3821C000, 0x3821E000, 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, + 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, + 0x3823E000, 0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, + 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, 0x38260000, + 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, + 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, 0x38280000, 0x38282000, 0x38284000, + 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, + 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, 0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, + 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, + 0x382BC000, 0x382BE000, 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, + 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, + 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, + 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, 0x38300000, 0x38302000, + 0x38304000, 0x38306000, 
0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, + 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, + 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, + 0x3833A000, 0x3833C000, 0x3833E000, 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, + 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, + 0x3835E000, 0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, + 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, 0x38380000, + 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, + 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, 0x383A0000, 0x383A2000, 0x383A4000, + 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, + 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, 0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, + 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, + 0x383DC000, 0x383DE000, 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, + 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, + 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, 0x38420000, 0x38422000, + 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, + 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, 0x38440000, 0x38442000, 0x38444000, 0x38446000, + 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, + 0x3845A000, 0x3845C000, 0x3845E000, 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, + 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, + 0x3847E000, 0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, + 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, 0x384A0000, + 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, + 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, 0x384C0000, 0x384C2000, 0x384C4000, + 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, + 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, 0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, + 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, + 0x384FC000, 0x384FE000, 0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, + 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, + 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, 0x38540000, 0x38542000, + 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 
0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, + 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, 0x38560000, 0x38562000, 0x38564000, 0x38566000, + 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, + 0x3857A000, 0x3857C000, 0x3857E000, 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, + 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, + 0x3859E000, 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, + 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, 0x385C0000, + 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, + 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, 0x385E0000, 0x385E2000, 0x385E4000, + 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, + 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, + 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, + 0x3861C000, 0x3861E000, 0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, + 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, + 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, 0x38660000, 0x38662000, + 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, + 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, 0x38680000, 0x38682000, 0x38684000, 0x38686000, + 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, + 0x3869A000, 0x3869C000, 0x3869E000, 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, + 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, + 0x386BE000, 0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, + 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, 0x386E0000, + 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, + 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, 0x38700000, 0x38702000, 0x38704000, + 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, + 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, 0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, + 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, + 0x3873C000, 0x3873E000, 0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, + 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, + 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, 0x38780000, 0x38782000, + 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 
0x38790000, 0x38792000, 0x38794000, + 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, 0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, + 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, + 0x387BA000, 0x387BC000, 0x387BE000, 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, + 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, + 0x387DE000, 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, + 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000}; + static const uint32 exponent_table[64] = {0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, + 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, + 0x07800000, 0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, + 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, 0x80000000, + 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, + 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, 0x88000000, 0x88800000, 0x89000000, + 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, + 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000}; + static const unsigned short offset_table[64] = {0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 1024, 1024, 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024}; + uint32 bits = mantissa_table[offset_table[value >> 10] + (value & 0x3FF)] + exponent_table[value >> 10]; + // return *reinterpret_cast(&bits); //violating strict aliasing! + float out; + std::memcpy(&out, &bits, sizeof(float)); + return out; +} + +/// Convert half-precision to IEEE double-precision. +/// \param value binary representation of half-precision value +/// \return double-precision value +inline double half2float_impl(uint16 value, double, true_type) +{ + typedef bits::type uint32; + typedef bits::type uint64; + uint32 hi = static_cast(value & 0x8000) << 16; + int abs = value & 0x7FFF; + if (abs) + { + hi |= 0x3F000000 << static_cast(abs >= 0x7C00); + for (; abs < 0x400; abs <<= 1, hi -= 0x100000) + ; + hi += static_cast(abs) << 10; + } + uint64 bits = static_cast(hi) << 32; + // return *reinterpret_cast(&bits); //violating strict aliasing! + double out; + std::memcpy(&out, &bits, sizeof(double)); + return out; +} + +/// Convert half-precision to non-IEEE floating point. +/// \tparam T type to convert to (builtin integer type) +/// \param value binary representation of half-precision value +/// \return floating point value +template +T half2float_impl(uint16 value, T, ...) +{ + T out; + int abs = value & 0x7FFF; + if (abs > 0x7C00) + out = std::numeric_limits::has_quiet_NaN ? std::numeric_limits::quiet_NaN() : T(); + else if (abs == 0x7C00) + out = std::numeric_limits::has_infinity ? 
std::numeric_limits<T>::infinity() : std::numeric_limits<T>::max();
+    else if (abs > 0x3FF)
+        out = std::ldexp(static_cast<T>((abs & 0x3FF) | 0x400), (abs >> 10) - 25);
+    else
+        out = std::ldexp(static_cast<T>(abs), -24);
+    return (value & 0x8000) ? -out : out;
+}
+
+/// Convert half-precision to floating point.
+/// \tparam T type to convert to (builtin floating point type)
+/// \param value binary representation of half-precision value
+/// \return floating point value
+template <typename T>
+T half2float(uint16 value)
+{
+    return half2float_impl(
+        value, T(), bool_type<std::numeric_limits<T>::is_iec559 && sizeof(typename bits<T>::type) == sizeof(T)>());
+}
+
+/// Convert half-precision floating point to integer.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam E `true` for round to even, `false` for round away from zero
+/// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign
+/// bits)
+/// \param value binary representation of half-precision value
+/// \return integral value
+template <std::float_round_style R, bool E, typename T>
+T half2int_impl(uint16 value)
+{
+#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+    static_assert(std::is_integral<T>::value, "half to int conversion only supports builtin integer types");
+#endif
+    uint32_t e = value & 0x7FFF;
+    if (e >= 0x7C00)
+        return (value & 0x8000) ? std::numeric_limits<T>::min() : std::numeric_limits<T>::max();
+    if (e < 0x3800)
+    {
+        if (R == std::round_toward_infinity)
+            return T(~(value >> 15) & (e != 0));
+        else if (R == std::round_toward_neg_infinity)
+            return -T(value > 0x8000);
+        return T();
+    }
+    uint32_t m = (value & 0x3FF) | 0x400;
+    e >>= 10;
+    if (e < 25)
+    {
+        if (R == std::round_to_nearest)
+            m += (1 << (24 - e)) - (~(m >> (25 - e)) & E);
+        else if (R == std::round_toward_infinity)
+            m += ((value >> 15) - 1) & ((1 << (25 - e)) - 1U);
+        else if (R == std::round_toward_neg_infinity)
+            m += -(value >> 15) & ((1 << (25 - e)) - 1U);
+        m >>= 25 - e;
+    }
+    else
+        m <<= e - 25;
+    return (value & 0x8000) ? -static_cast<T>(m) : static_cast<T>(m);
+}
+
+/// Convert half-precision floating point to integer.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign
+/// bits)
+/// \param value binary representation of half-precision value
+/// \return integral value
+template <std::float_round_style R, typename T>
+T half2int(uint16 value)
+{
+    return half2int_impl<R, HALF_ROUND_TIES_TO_EVEN, T>(value);
+}
+
+/// Convert half-precision floating point to integer using round-to-nearest-away-from-zero.
+/// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign
+/// bits)
+/// \param value binary representation of half-precision value
+/// \return integral value
+template <typename T>
+T half2int_up(uint16 value)
+{
+    return half2int_impl<std::round_to_nearest, 0, T>(value);
+}
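+
+// Illustrative tie-breaking check (editorial, not upstream code): the half 0x4100 is
+// exactly 2.5, where the two round-to-nearest flavours disagree:
+//
+// ~~~~{.cpp}
+// int away = half2int_impl<std::round_to_nearest, 0, int>(0x4100); // 3: ties away from zero
+// int even = half2int_impl<std::round_to_nearest, 1, int>(0x4100); // 2: ties to even
+// ~~~~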
+
+/// Round half-precision number to nearest integer value.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam E `true` for round to even, `false` for round away from zero
+/// \param value binary representation of half-precision value
+/// \return half-precision bits for nearest integral value
+template <std::float_round_style R, bool E>
+uint16 round_half_impl(uint16 value)
+{
+    uint32_t e = value & 0x7FFF;
+    uint16 result = value;
+    if (e < 0x3C00)
+    {
+        result &= 0x8000;
+        if (R == std::round_to_nearest)
+            result |= 0x3C00U & -(e >= (0x3800 + E));
+        else if (R == std::round_toward_infinity)
+            result |= 0x3C00U & -(~(value >> 15) & (e != 0));
+        else if (R == std::round_toward_neg_infinity)
+            result |= 0x3C00U & -(value > 0x8000);
+    }
+    else if (e < 0x6400)
+    {
+        e = 25 - (e >> 10);
+        uint32_t mask = (1 << e) - 1;
+        if (R == std::round_to_nearest)
+            result += (1 << (e - 1)) - (~(result >> e) & E);
+        else if (R == std::round_toward_infinity)
+            result += mask & ((value >> 15) - 1);
+        else if (R == std::round_toward_neg_infinity)
+            result += mask & -(value >> 15);
+        result &= ~mask;
+    }
+    return result;
+}
+
+/// Round half-precision number to nearest integer value.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \param value binary representation of half-precision value
+/// \return half-precision bits for nearest integral value
+template <std::float_round_style R>
+uint16 round_half(uint16 value)
+{
+    return round_half_impl<R, HALF_ROUND_TIES_TO_EVEN>(value);
+}
+
+/// Round half-precision number to nearest integer value using round-to-nearest-away-from-zero.
+/// \param value binary representation of half-precision value
+/// \return half-precision bits for nearest integral value
+inline uint16 round_half_up(uint16 value)
+{
+    return round_half_impl<std::round_to_nearest, 0>(value);
+}
+/// \}
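+
+// Quick sanity sketch (editorial, not upstream code): 0x3E00 is the half 1.5; rounding it
+// to an integral half with ties away from zero gives 2.0:
+//
+// ~~~~{.cpp}
+// uint16 two = round_half_up(0x3E00); // 0x4000 == 2.0
+// ~~~~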
+
+struct functions;
+template <typename>
+struct unary_specialized;
+template <typename, typename>
+struct binary_specialized;
+template <typename, typename, std::float_round_style>
+struct half_caster;
+} // namespace detail
+
+/// Half-precision floating point type.
+/// This class implements an IEEE-conformant half-precision floating point type with the usual arithmetic operators and
+/// conversions. It is implicitly convertible to single-precision floating point, which makes arithmetic expressions and
+/// functions with mixed-type operands be of the most precise operand type. Additionally all arithmetic operations
+/// (and many mathematical functions) are carried out in single-precision internally. All conversions from single- to
+/// half-precision are done using the library's default rounding mode, but temporary results inside chained arithmetic
+/// expressions are kept in single-precision as long as possible (while of course still maintaining a strong
+/// half-precision type).
+///
+/// According to the C++98/03 definition, the half type is not a POD type. But according to C++11's less strict and
+/// extended definitions it is both a standard layout type and a trivially copyable type (even if not a POD type), which
+/// means it can be standard-conformantly copied using raw binary copies. In this context a few more words about the
+/// actual size of the type are in order. Although the half represents an IEEE 16-bit type, it does not necessarily have
+/// to be exactly 16 bits in size. But on any reasonable implementation the actual binary representation of this type
+/// will most probably not involve any additional "magic" or padding beyond the simple binary representation of the
+/// underlying 16-bit IEEE number, even if not strictly guaranteed by the standard. Even then it only has an actual size
+/// of 16 bits if your C++ implementation supports an unsigned integer type of exactly 16 bits width, but this should be
+/// the case on nearly any reasonable platform.
+///
+/// So if your C++ implementation is not totally exotic or imposes special alignment requirements, it is a reasonable
+/// assumption that the data of a half is just comprised of the 2 bytes of the underlying IEEE representation.
+class half
+{
+    friend struct detail::functions;
+    friend struct detail::unary_specialized<half>;
+    friend struct detail::binary_specialized<half, half>;
+    template <typename, typename, std::float_round_style>
+    friend struct detail::half_caster;
+    friend class std::numeric_limits<half>;
+#if HALF_ENABLE_CPP11_HASH
+    friend struct std::hash<half>;
+#endif
+#if HALF_ENABLE_CPP11_USER_LITERALS
+    friend half literal::operator"" _h(long double);
+#endif
+
+public:
+    /// Default constructor.
+    /// This initializes the half to 0. Although this does not match the builtin types' default-initialization semantics
+    /// and may be less efficient than no initialization, it is needed to provide proper value-initialization semantics.
+    HALF_CONSTEXPR half() HALF_NOEXCEPT : data_() {}
+
+    /// Copy constructor.
+    /// \param rhs half expression to copy from
+    half(detail::expr rhs)
+        : data_(detail::float2half<round_style>(static_cast<float>(rhs)))
+    {
+    }
+
+    /// Conversion constructor.
+    /// \param rhs float to convert
+    explicit half(float rhs)
+        : data_(detail::float2half<round_style>(rhs))
+    {
+    }
+
+    /// Conversion to single-precision.
+    /// \return single precision value representing expression value
+    operator float() const
+    {
+        return detail::half2float<float>(data_);
+    }
+
+    /// Assignment operator.
+    /// \param rhs half expression to copy from
+    /// \return reference to this half
+    half& operator=(detail::expr rhs)
+    {
+        return *this = static_cast<float>(rhs);
+    }
+
+    /// Arithmetic assignment.
+    /// \tparam T type of concrete half expression
+    /// \param rhs half expression to add
+    /// \return reference to this half
+    template <typename T>
+    typename detail::enable<half&, T>::type operator+=(T rhs)
+    {
+        return *this += static_cast<float>(rhs);
+    }
+
+    /// Arithmetic assignment.
+    /// \tparam T type of concrete half expression
+    /// \param rhs half expression to subtract
+    /// \return reference to this half
+    template <typename T>
+    typename detail::enable<half&, T>::type operator-=(T rhs)
+    {
+        return *this -= static_cast<float>(rhs);
+    }
+
+    /// Arithmetic assignment.
+    /// \tparam T type of concrete half expression
+    /// \param rhs half expression to multiply with
+    /// \return reference to this half
+    template <typename T>
+    typename detail::enable<half&, T>::type operator*=(T rhs)
+    {
+        return *this *= static_cast<float>(rhs);
+    }
+
+    /// Arithmetic assignment.
+    /// \tparam T type of concrete half expression
+    /// \param rhs half expression to divide by
+    /// \return reference to this half
+    template <typename T>
+    typename detail::enable<half&, T>::type operator/=(T rhs)
+    {
+        return *this /= static_cast<float>(rhs);
+    }
+
+    /// Assignment operator.
+    /// \param rhs single-precision value to copy from
+    /// \return reference to this half
+    half& operator=(float rhs)
+    {
+        data_ = detail::float2half<round_style>(rhs);
+        return *this;
+    }
+
+    /// Arithmetic assignment.
+    /// \param rhs single-precision value to add
+    /// \return reference to this half
+    half& operator+=(float rhs)
+    {
+        data_ = detail::float2half<round_style>(detail::half2float<float>(data_) + rhs);
+        return *this;
+    }
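+
+    // Behavioural sketch (editorial, not upstream code): each compound assignment
+    // round-trips through float, so rounding to 16 bits happens exactly once per
+    // statement, at the final float2half():
+    //
+    // ~~~~{.cpp}
+    // half h(1.0f);
+    // h += 0.0001f; // computed as float 1.0001f; below half's ulp at 1.0 (2^-10),
+    //               // so with round-to-nearest h remains exactly 1.0
+    // ~~~~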
+
+#if HALF_ENABLE_CPP11_USER_LITERALS
+namespace literal
+{
+/// Half literal.
+/// While this returns an actual half-precision value, half literals can unfortunately not be constant expressions due
+/// to rather involved conversions.
+/// \param value literal value
+/// \return half with given value (if representable)
+inline half operator"" _h(long double value)
+{
+    return half(detail::binary, detail::float2half<half::round_style>(value));
+}
+} // namespace literal
+#endif
+
+namespace detail
+{
+/// Wrapper implementing unspecialized half-precision functions.
+struct functions
+{
+    /// Addition implementation.
+    /// \param x first operand
+    /// \param y second operand
+    /// \return Half-precision sum stored in single-precision
+    static expr plus(float x, float y)
+    {
+        return expr(x + y);
+    }
+
+    /// Subtraction implementation.
+    /// \param x first operand
+    /// \param y second operand
+    /// \return Half-precision difference stored in single-precision
+    static expr minus(float x, float y)
+    {
+        return expr(x - y);
+    }
+
+    /// Multiplication implementation.
+    /// \param x first operand
+    /// \param y second operand
+    /// \return Half-precision product stored in single-precision
+    static expr multiplies(float x, float y)
+    {
+        return expr(x * y);
+    }
+
+    /// Division implementation.
+    /// \param x first operand
+    /// \param y second operand
+    /// \return Half-precision quotient stored in single-precision
+    static expr divides(float x, float y)
+    {
+        return expr(x / y);
+    }
+
+    /// Output implementation.
+    /// \param out stream to write to
+    /// \param arg value to write
+    /// \return reference to stream
+    template <typename charT, typename traits>
+    static std::basic_ostream<charT, traits>& write(std::basic_ostream<charT, traits>& out, float arg)
+    {
+        return out << arg;
+    }
+
+    /// Input implementation.
+ /// \param in stream to read from + /// \param arg half to read into + /// \return reference to stream + template + static std::basic_istream& read(std::basic_istream& in, half& arg) + { + float f; + if (in >> f) + arg = f; + return in; + } + + /// Modulo implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision division remainder stored in single-precision + static expr fmod(float x, float y) + { + return expr(std::fmod(x, y)); + } + + /// Remainder implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision division remainder stored in single-precision + static expr remainder(float x, float y) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::remainder(x, y)); +#else + if (builtin_isnan(x) || builtin_isnan(y)) + return expr(std::numeric_limits::quiet_NaN()); + float ax = std::fabs(x), ay = std::fabs(y); + if (ax >= 65536.0f || ay < std::ldexp(1.0f, -24)) + return expr(std::numeric_limits::quiet_NaN()); + if (ay >= 65536.0f) + return expr(x); + if (ax == ay) + return expr(builtin_signbit(x) ? -0.0f : 0.0f); + ax = std::fmod(ax, ay + ay); + float y2 = 0.5f * ay; + if (ax > y2) + { + ax -= ay; + if (ax >= y2) + ax -= ay; + } + return expr(builtin_signbit(x) ? -ax : ax); +#endif + } + + /// Remainder implementation. + /// \param x first operand + /// \param y second operand + /// \param quo address to store quotient bits at + /// \return Half-precision division remainder stored in single-precision + static expr remquo(float x, float y, int* quo) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::remquo(x, y, quo)); +#else + if (builtin_isnan(x) || builtin_isnan(y)) + return expr(std::numeric_limits::quiet_NaN()); + bool sign = builtin_signbit(x), qsign = static_cast(sign ^ builtin_signbit(y)); + float ax = std::fabs(x), ay = std::fabs(y); + if (ax >= 65536.0f || ay < std::ldexp(1.0f, -24)) + return expr(std::numeric_limits::quiet_NaN()); + if (ay >= 65536.0f) + return expr(x); + if (ax == ay) + return *quo = qsign ? -1 : 1, expr(sign ? -0.0f : 0.0f); + ax = std::fmod(ax, 8.0f * ay); + int cquo = 0; + if (ax >= 4.0f * ay) + { + ax -= 4.0f * ay; + cquo += 4; + } + if (ax >= 2.0f * ay) + { + ax -= 2.0f * ay; + cquo += 2; + } + float y2 = 0.5f * ay; + if (ax > y2) + { + ax -= ay; + ++cquo; + if (ax >= y2) + { + ax -= ay; + ++cquo; + } + } + return *quo = qsign ? -cquo : cquo, expr(sign ? -ax : ax); +#endif + } + + /// Positive difference implementation. + /// \param x first operand + /// \param y second operand + /// \return Positive difference stored in single-precision + static expr fdim(float x, float y) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::fdim(x, y)); +#else + return expr((x <= y) ? 0.0f : (x - y)); +#endif + } + + /// Fused multiply-add implementation. + /// \param x first operand + /// \param y second operand + /// \param z third operand + /// \return \a x * \a y + \a z stored in single-precision + static expr fma(float x, float y, float z) + { +#if HALF_ENABLE_CPP11_CMATH && defined(FP_FAST_FMAF) + return expr(std::fma(x, y, z)); +#else + return expr(x * y + z); +#endif + } + + /// Get NaN. + /// \return Half-precision quiet NaN + static half nanh() + { + return half(binary, 0x7FFF); + } + + /// Exponential implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr exp(float arg) + { + return expr(std::exp(arg)); + } + + /// Exponential implementation. 
+ /// \param arg function argument + /// \return function value stored in single-preicision + static expr expm1(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::expm1(arg)); +#else + return expr(static_cast(std::exp(static_cast(arg)) - 1.0)); +#endif + } + + /// Binary exponential implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr exp2(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::exp2(arg)); +#else + return expr(static_cast(std::exp(arg * 0.69314718055994530941723212145818))); +#endif + } + + /// Logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr log(float arg) + { + return expr(std::log(arg)); + } + + /// Common logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr log10(float arg) + { + return expr(std::log10(arg)); + } + + /// Logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr log1p(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::log1p(arg)); +#else + return expr(static_cast(std::log(1.0 + arg))); +#endif + } + + /// Binary logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr log2(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::log2(arg)); +#else + return expr(static_cast(std::log(static_cast(arg)) * 1.4426950408889634073599246810019)); +#endif + } + + /// Square root implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr sqrt(float arg) + { + return expr(std::sqrt(arg)); + } + + /// Cubic root implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr cbrt(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::cbrt(arg)); +#else + if (builtin_isnan(arg) || builtin_isinf(arg)) + return expr(arg); + return expr(builtin_signbit(arg) ? -static_cast(std::pow(-static_cast(arg), 1.0 / 3.0)) + : static_cast(std::pow(static_cast(arg), 1.0 / 3.0))); +#endif + } + + /// Hypotenuse implementation. + /// \param x first argument + /// \param y second argument + /// \return function value stored in single-preicision + static expr hypot(float x, float y) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::hypot(x, y)); +#else + return expr((builtin_isinf(x) || builtin_isinf(y)) + ? std::numeric_limits::infinity() + : static_cast(std::sqrt(static_cast(x) * x + static_cast(y) * y))); +#endif + } + + /// Power implementation. + /// \param base value to exponentiate + /// \param exp power to expontiate to + /// \return function value stored in single-preicision + static expr pow(float base, float exp) + { + return expr(std::pow(base, exp)); + } + + /// Sine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr sin(float arg) + { + return expr(std::sin(arg)); + } + + /// Cosine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr cos(float arg) + { + return expr(std::cos(arg)); + } + + /// Tan implementation. 
+ /// \param arg function argument + /// \return function value stored in single-preicision + static expr tan(float arg) + { + return expr(std::tan(arg)); + } + + /// Arc sine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr asin(float arg) + { + return expr(std::asin(arg)); + } + + /// Arc cosine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr acos(float arg) + { + return expr(std::acos(arg)); + } + + /// Arc tangent implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr atan(float arg) + { + return expr(std::atan(arg)); + } + + /// Arc tangent implementation. + /// \param x first argument + /// \param y second argument + /// \return function value stored in single-preicision + static expr atan2(float x, float y) + { + return expr(std::atan2(x, y)); + } + + /// Hyperbolic sine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr sinh(float arg) + { + return expr(std::sinh(arg)); + } + + /// Hyperbolic cosine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr cosh(float arg) + { + return expr(std::cosh(arg)); + } + + /// Hyperbolic tangent implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr tanh(float arg) + { + return expr(std::tanh(arg)); + } + + /// Hyperbolic area sine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr asinh(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::asinh(arg)); +#else + return expr((arg == -std::numeric_limits::infinity()) + ? arg + : static_cast(std::log(arg + std::sqrt(arg * arg + 1.0)))); +#endif + } + + /// Hyperbolic area cosine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr acosh(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::acosh(arg)); +#else + return expr((arg < -1.0f) ? std::numeric_limits::quiet_NaN() + : static_cast(std::log(arg + std::sqrt(arg * arg - 1.0)))); +#endif + } + + /// Hyperbolic area tangent implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr atanh(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::atanh(arg)); +#else + return expr(static_cast(0.5 * std::log((1.0 + arg) / (1.0 - arg)))); +#endif + } + + /// Error function implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr erf(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::erf(arg)); +#else + return expr(static_cast(erf(static_cast(arg)))); +#endif + } + + /// Complementary implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr erfc(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::erfc(arg)); +#else + return expr(static_cast(1.0 - erf(static_cast(arg)))); +#endif + } + + /// Gamma logarithm implementation. 
+ /// \param arg function argument + /// \return function value stored in single-preicision + static expr lgamma(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::lgamma(arg)); +#else + if (builtin_isinf(arg)) + return expr(std::numeric_limits::infinity()); + if (arg < 0.0f) + { + float i, f = std::modf(-arg, &i); + if (f == 0.0f) + return expr(std::numeric_limits::infinity()); + return expr(static_cast(1.1447298858494001741434273513531 + - std::log(std::abs(std::sin(3.1415926535897932384626433832795 * f))) - lgamma(1.0 - arg))); + } + return expr(static_cast(lgamma(static_cast(arg)))); +#endif + } + + /// Gamma implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr tgamma(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::tgamma(arg)); +#else + if (arg == 0.0f) + return builtin_signbit(arg) ? expr(-std::numeric_limits::infinity()) + : expr(std::numeric_limits::infinity()); + if (arg < 0.0f) + { + float i, f = std::modf(-arg, &i); + if (f == 0.0f) + return expr(std::numeric_limits::quiet_NaN()); + double value = 3.1415926535897932384626433832795 + / (std::sin(3.1415926535897932384626433832795 * f) * std::exp(lgamma(1.0 - arg))); + return expr(static_cast((std::fmod(i, 2.0f) == 0.0f) ? -value : value)); + } + if (builtin_isinf(arg)) + return expr(arg); + return expr(static_cast(std::exp(lgamma(static_cast(arg))))); +#endif + } + + /// Floor implementation. + /// \param arg value to round + /// \return rounded value + static half floor(half arg) + { + return half(binary, round_half(arg.data_)); + } + + /// Ceiling implementation. + /// \param arg value to round + /// \return rounded value + static half ceil(half arg) + { + return half(binary, round_half(arg.data_)); + } + + /// Truncation implementation. + /// \param arg value to round + /// \return rounded value + static half trunc(half arg) + { + return half(binary, round_half(arg.data_)); + } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static half round(half arg) + { + return half(binary, round_half_up(arg.data_)); + } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static long lround(half arg) + { + return detail::half2int_up(arg.data_); + } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static half rint(half arg) + { + return half(binary, round_half(arg.data_)); + } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static long lrint(half arg) + { + return detail::half2int(arg.data_); + } + +#if HALF_ENABLE_CPP11_LONG_LONG + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static long long llround(half arg) + { + return detail::half2int_up(arg.data_); + } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static long long llrint(half arg) + { + return detail::half2int(arg.data_); + } +#endif + + /// Decompression implementation. 
+ /// \param arg number to decompress + /// \param exp address to store exponent at + /// \return normalized significant + static half frexp(half arg, int* exp) + { + int m = arg.data_ & 0x7FFF, e = -14; + if (m >= 0x7C00 || !m) + return *exp = 0, arg; + for (; m < 0x400; m <<= 1, --e) + ; + return *exp = e + (m >> 10), half(binary, (arg.data_ & 0x8000) | 0x3800 | (m & 0x3FF)); + } + + /// Decompression implementation. + /// \param arg number to decompress + /// \param iptr address to store integer part at + /// \return fractional part + static half modf(half arg, half* iptr) + { + uint32_t e = arg.data_ & 0x7FFF; + if (e >= 0x6400) + return *iptr = arg, half(binary, arg.data_ & (0x8000U | -(e > 0x7C00))); + if (e < 0x3C00) + return iptr->data_ = arg.data_ & 0x8000, arg; + e >>= 10; + uint32_t mask = (1 << (25 - e)) - 1, m = arg.data_ & mask; + iptr->data_ = arg.data_ & ~mask; + if (!m) + return half(binary, arg.data_ & 0x8000); + for (; m < 0x400; m <<= 1, --e) + ; + return half(binary, static_cast((arg.data_ & 0x8000) | (e << 10) | (m & 0x3FF))); + } + + /// Scaling implementation. + /// \param arg number to scale + /// \param exp power of two to scale by + /// \return scaled number + static half scalbln(half arg, long exp) + { + uint32_t m = arg.data_ & 0x7FFF; + if (m >= 0x7C00 || !m) + return arg; + for (; m < 0x400; m <<= 1, --exp) + ; + exp += m >> 10; + uint16 value = arg.data_ & 0x8000; + if (exp > 30) + { + if (half::round_style == std::round_toward_zero) + value |= 0x7BFF; + else if (half::round_style == std::round_toward_infinity) + value |= 0x7C00 - (value >> 15); + else if (half::round_style == std::round_toward_neg_infinity) + value |= 0x7BFF + (value >> 15); + else + value |= 0x7C00; + } + else if (exp > 0) + value |= (exp << 10) | (m & 0x3FF); + else if (exp > -11) + { + m = (m & 0x3FF) | 0x400; + if (half::round_style == std::round_to_nearest) + { + m += 1 << -exp; +#if HALF_ROUND_TIES_TO_EVEN + m -= (m >> (1 - exp)) & 1; +#endif + } + else if (half::round_style == std::round_toward_infinity) + m += ((value >> 15) - 1) & ((1 << (1 - exp)) - 1U); + else if (half::round_style == std::round_toward_neg_infinity) + m += -(value >> 15) & ((1 << (1 - exp)) - 1U); + value |= m >> (1 - exp); + } + else if (half::round_style == std::round_toward_infinity) + value -= (value >> 15) - 1; + else if (half::round_style == std::round_toward_neg_infinity) + value += value >> 15; + return half(binary, value); + } + + /// Exponent implementation. + /// \param arg number to query + /// \return floating point exponent + static int ilogb(half arg) + { + int abs = arg.data_ & 0x7FFF; + if (!abs) + return FP_ILOGB0; + if (abs < 0x7C00) + { + int exp = (abs >> 10) - 15; + if (abs < 0x400) + for (; abs < 0x200; abs <<= 1, --exp) + ; + return exp; + } + if (abs > 0x7C00) + return FP_ILOGBNAN; + return INT_MAX; + } + + /// Exponent implementation. + /// \param arg number to query + /// \return floating point exponent + static half logb(half arg) + { + int abs = arg.data_ & 0x7FFF; + if (!abs) + return half(binary, 0xFC00); + if (abs < 0x7C00) + { + int exp = (abs >> 10) - 15; + if (abs < 0x400) + for (; abs < 0x200; abs <<= 1, --exp) + ; + uint16 bits = (exp < 0) << 15; + if (exp) + { + uint32_t m = std::abs(exp) << 6, e = 18; + for (; m < 0x400; m <<= 1, --e) + ; + bits |= (e << 10) + m; + } + return half(binary, bits); + } + if (abs > 0x7C00) + return arg; + return half(binary, 0x7C00); + } + + /// Enumeration implementation. 
+ /// \param from number to increase/decrease + /// \param to direction to enumerate into + /// \return next representable number + static half nextafter(half from, half to) + { + uint16 fabs = from.data_ & 0x7FFF, tabs = to.data_ & 0x7FFF; + if (fabs > 0x7C00) + return from; + if (tabs > 0x7C00 || from.data_ == to.data_ || !(fabs | tabs)) + return to; + if (!fabs) + return half(binary, (to.data_ & 0x8000) + 1); + bool lt = ((fabs == from.data_) ? static_cast(fabs) : -static_cast(fabs)) + < ((tabs == to.data_) ? static_cast(tabs) : -static_cast(tabs)); + return half(binary, from.data_ + (((from.data_ >> 15) ^ static_cast(lt)) << 1) - 1); + } + + /// Enumeration implementation. + /// \param from number to increase/decrease + /// \param to direction to enumerate into + /// \return next representable number + static half nexttoward(half from, long double to) + { + if (isnan(from)) + return from; + long double lfrom = static_cast(from); + if (builtin_isnan(to) || lfrom == to) + return half(static_cast(to)); + if (!(from.data_ & 0x7FFF)) + return half(binary, (static_cast(builtin_signbit(to)) << 15) + 1); + return half(binary, from.data_ + (((from.data_ >> 15) ^ static_cast(lfrom < to)) << 1) - 1); + } + + /// Sign implementation + /// \param x first operand + /// \param y second operand + /// \return composed value + static half copysign(half x, half y) + { + return half(binary, x.data_ ^ ((x.data_ ^ y.data_) & 0x8000)); + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if infinite number + /// \retval false else + static int fpclassify(half arg) + { + uint32_t abs = arg.data_ & 0x7FFF; + return abs + ? ((abs > 0x3FF) ? ((abs >= 0x7C00) ? ((abs > 0x7C00) ? FP_NAN : FP_INFINITE) : FP_NORMAL) : FP_SUBNORMAL) + : FP_ZERO; + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if finite number + /// \retval false else + static bool isfinite(half arg) + { + return (arg.data_ & 0x7C00) != 0x7C00; + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if infinite number + /// \retval false else + static bool isinf(half arg) + { + return (arg.data_ & 0x7FFF) == 0x7C00; + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if not a number + /// \retval false else + static bool isnan(half arg) + { + return (arg.data_ & 0x7FFF) > 0x7C00; + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if normal number + /// \retval false else + static bool isnormal(half arg) + { + return ((arg.data_ & 0x7C00) != 0) & ((arg.data_ & 0x7C00) != 0x7C00); + } + + /// Sign bit implementation. + /// \param arg value to check + /// \retval true if signed + /// \retval false if unsigned + static bool signbit(half arg) + { + return (arg.data_ & 0x8000) != 0; + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if operands equal + /// \retval false else + static bool isequal(half x, half y) + { + return (x.data_ == y.data_ || !((x.data_ | y.data_) & 0x7FFF)) && !isnan(x); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if operands not equal + /// \retval false else + static bool isnotequal(half x, half y) + { + return (x.data_ != y.data_ && ((x.data_ | y.data_) & 0x7FFF)) || isnan(x); + } + + /// Comparison implementation. 
+ /// \param x first operand + /// \param y second operand + /// \retval true if \a x > \a y + /// \retval false else + static bool isgreater(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + return xabs <= 0x7C00 && yabs <= 0x7C00 + && (((xabs == x.data_) ? xabs : -xabs) > ((yabs == y.data_) ? yabs : -yabs)); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x >= \a y + /// \retval false else + static bool isgreaterequal(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + return xabs <= 0x7C00 && yabs <= 0x7C00 + && (((xabs == x.data_) ? xabs : -xabs) >= ((yabs == y.data_) ? yabs : -yabs)); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x < \a y + /// \retval false else + static bool isless(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + return xabs <= 0x7C00 && yabs <= 0x7C00 + && (((xabs == x.data_) ? xabs : -xabs) < ((yabs == y.data_) ? yabs : -yabs)); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x <= \a y + /// \retval false else + static bool islessequal(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + return xabs <= 0x7C00 && yabs <= 0x7C00 + && (((xabs == x.data_) ? xabs : -xabs) <= ((yabs == y.data_) ? yabs : -yabs)); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if either \a x > \a y nor \a x < \a y + /// \retval false else + static bool islessgreater(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + if (xabs > 0x7C00 || yabs > 0x7C00) + return false; + int a = (xabs == x.data_) ? xabs : -xabs, b = (yabs == y.data_) ? yabs : -yabs; + return a < b || a > b; + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if operand unordered + /// \retval false else + static bool isunordered(half x, half y) + { + return isnan(x) || isnan(y); + } + +private: + static double erf(double arg) + { + if (builtin_isinf(arg)) + return (arg < 0.0) ? -1.0 : 1.0; + double x2 = arg * arg, ax2 = 0.147 * x2, + value = std::sqrt(1.0 - std::exp(-x2 * (1.2732395447351626861510701069801 + ax2) / (1.0 + ax2))); + return builtin_signbit(arg) ? -value : value; + } + + static double lgamma(double arg) + { + double v = 1.0; + for (; arg < 8.0; ++arg) + v *= arg; + double w = 1.0 / (arg * arg); + return (((((((-0.02955065359477124183006535947712 * w + 0.00641025641025641025641025641026) * w + + -0.00191752691752691752691752691753) + * w + + 8.4175084175084175084175084175084e-4) + * w + + -5.952380952380952380952380952381e-4) + * w + + 7.9365079365079365079365079365079e-4) + * w + + -0.00277777777777777777777777777778) + * w + + 0.08333333333333333333333333333333) + / arg + + 0.91893853320467274178032973640562 - std::log(v) - arg + (arg - 0.5) * std::log(arg); + } +}; + +/// Wrapper for unary half-precision functions needing specialization for individual argument types. +/// \tparam T argument type +template +struct unary_specialized +{ + /// Negation implementation. + /// \param arg value to negate + /// \return negated value + static HALF_CONSTEXPR half negate(half arg) + { + return half(binary, arg.data_ ^ 0x8000); + } + + /// Absolute value implementation. 
+    /// \param arg function argument
+    /// \return absolute value
+    static half fabs(half arg)
+    {
+        return half(binary, arg.data_ & 0x7FFF);
+    }
+};
+template <>
+struct unary_specialized<expr>
+{
+    static HALF_CONSTEXPR expr negate(float arg)
+    {
+        return expr(-arg);
+    }
+    static expr fabs(float arg)
+    {
+        return expr(std::fabs(arg));
+    }
+};
+
+/// Wrapper for binary half-precision functions needing specialization for individual argument types.
+/// \tparam T first argument type
+/// \tparam U second argument type
+template <typename T, typename U>
+struct binary_specialized
+{
+    /// Minimum implementation.
+    /// \param x first operand
+    /// \param y second operand
+    /// \return minimum value
+    static expr fmin(float x, float y)
+    {
+#if HALF_ENABLE_CPP11_CMATH
+        return expr(std::fmin(x, y));
+#else
+        if (builtin_isnan(x))
+            return expr(y);
+        if (builtin_isnan(y))
+            return expr(x);
+        return expr(std::min(x, y));
+#endif
+    }
+
+    /// Maximum implementation.
+    /// \param x first operand
+    /// \param y second operand
+    /// \return maximum value
+    static expr fmax(float x, float y)
+    {
+#if HALF_ENABLE_CPP11_CMATH
+        return expr(std::fmax(x, y));
+#else
+        if (builtin_isnan(x))
+            return expr(y);
+        if (builtin_isnan(y))
+            return expr(x);
+        return expr(std::max(x, y));
+#endif
+    }
+};
+template <>
+struct binary_specialized<half, half>
+{
+    static half fmin(half x, half y)
+    {
+        int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
+        if (xabs > 0x7C00)
+            return y;
+        if (yabs > 0x7C00)
+            return x;
+        return (((xabs == x.data_) ? xabs : -xabs) > ((yabs == y.data_) ? yabs : -yabs)) ? y : x;
+    }
+    static half fmax(half x, half y)
+    {
+        int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
+        if (xabs > 0x7C00)
+            return y;
+        if (yabs > 0x7C00)
+            return x;
+        return (((xabs == x.data_) ? xabs : -xabs) < ((yabs == y.data_) ? yabs : -yabs)) ? y : x;
+    }
+};
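The `binary_specialized<half, half>` branch above gives `fmin`/`fmax` the usual IEEE preference for the non-NaN operand without leaving half precision. A sketch under the same "half.h"/`half_float` assumptions as above; the free functions it calls are declared further down in this header:

```cpp
#include "half.h"
#include <cassert>

int main()
{
    using half_float::half;
    half one(1.0f);
    half qnan = half_float::nanh(""); // quiet NaN, bit pattern 0x7FFF
    assert(half_float::fmin(one, qnan) == one); // non-NaN operand wins
    assert(half_float::fmax(qnan, one) == one);
    return 0;
}
```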
+
+/// Helper class for half casts.
+/// This class template has to be specialized for all valid cast arguments to define an appropriate static `cast` member
+/// function and a corresponding `type` member denoting its return type.
+/// \tparam T destination type
+/// \tparam U source type
+/// \tparam R rounding mode to use
+template <typename T, typename U, std::float_round_style R = (std::float_round_style)(HALF_ROUND_STYLE)>
+struct half_caster
+{
+};
+template <typename U, std::float_round_style R>
+struct half_caster<half, U, R>
+{
+#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+    static_assert(std::is_arithmetic<U>::value, "half_cast from non-arithmetic type unsupported");
+#endif
+
+    static half cast(U arg)
+    {
+        return cast_impl(arg, is_float<U>());
+    };
+
+private:
+    static half cast_impl(U arg, true_type)
+    {
+        return half(binary, float2half<R>(arg));
+    }
+    static half cast_impl(U arg, false_type)
+    {
+        return half(binary, int2half<R>(arg));
+    }
+};
+template <typename T, std::float_round_style R>
+struct half_caster<T, half, R>
+{
+#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+    static_assert(std::is_arithmetic<T>::value, "half_cast to non-arithmetic type unsupported");
+#endif
+
+    static T cast(half arg)
+    {
+        return cast_impl(arg, is_float<T>());
+    }
+
+private:
+    static T cast_impl(half arg, true_type)
+    {
+        return half2float<T>(arg.data_);
+    }
+    static T cast_impl(half arg, false_type)
+    {
+        return half2int<R, T>(arg.data_);
+    }
+};
+template <typename T, std::float_round_style R>
+struct half_caster<T, expr, R>
+{
+#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+    static_assert(std::is_arithmetic<T>::value, "half_cast to non-arithmetic type unsupported");
+#endif
+
+    static T cast(expr arg)
+    {
+        return cast_impl(arg, is_float<T>());
+    }
+
+private:
+    static T cast_impl(float arg, true_type)
+    {
+        return static_cast<T>(arg);
+    }
+    static T cast_impl(half arg, false_type)
+    {
+        return half2int<R, T>(arg.data_);
+    }
+};
+template <std::float_round_style R>
+struct half_caster<half, half, R>
+{
+    static half cast(half arg)
+    {
+        return arg;
+    }
+};
+template <std::float_round_style R>
+struct half_caster<half, expr, R> : half_caster<half, half, R>
+{
+};
+
+/// \name Comparison operators
+/// \{
+
+/// Comparison for equality.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if operands equal
+/// \retval false else
+template <typename T, typename U>
+typename enable<bool, T, U>::type operator==(T x, U y)
+{
+    return functions::isequal(x, y);
+}
+
+/// Comparison for inequality.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if operands not equal
+/// \retval false else
+template <typename T, typename U>
+typename enable<bool, T, U>::type operator!=(T x, U y)
+{
+    return functions::isnotequal(x, y);
+}
+
+/// Comparison for less than.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x less than \a y
+/// \retval false else
+template <typename T, typename U>
+typename enable<bool, T, U>::type operator<(T x, U y)
+{
+    return functions::isless(x, y);
+}
+
+/// Comparison for greater than.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x greater than \a y
+/// \retval false else
+template <typename T, typename U>
+typename enable<bool, T, U>::type operator>(T x, U y)
+{
+    return functions::isgreater(x, y);
+}
+
+/// Comparison for less equal.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x less equal \a y
+/// \retval false else
+template <typename T, typename U>
+typename enable<bool, T, U>::type operator<=(T x, U y)
+{
+    return functions::islessequal(x, y);
+}
+
+/// Comparison for greater equal.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x greater equal \a y
+/// \retval false else
+template <typename T, typename U>
+typename enable<bool, T, U>::type operator>=(T x, U y)
+{
+    return functions::isgreaterequal(x, y);
+}
+
+/// \}
+/// \name Arithmetic operators
+/// \{
+
+/// Add halfs.
+/// \param x left operand
+/// \param y right operand
+/// \return sum of half expressions
+template <typename T, typename U>
+typename enable<expr, T, U>::type operator+(T x, U y)
+{
+    return functions::plus(x, y);
+}
+
+/// Subtract halfs.
+/// \param x left operand +/// \param y right operand +/// \return difference of half expressions +template +typename enable::type operator-(T x, U y) +{ + return functions::minus(x, y); +} + +/// Multiply halfs. +/// \param x left operand +/// \param y right operand +/// \return product of half expressions +template +typename enable::type operator*(T x, U y) +{ + return functions::multiplies(x, y); +} + +/// Divide halfs. +/// \param x left operand +/// \param y right operand +/// \return quotient of half expressions +template +typename enable::type operator/(T x, U y) +{ + return functions::divides(x, y); +} + +/// Identity. +/// \param arg operand +/// \return uncahnged operand +template +HALF_CONSTEXPR typename enable::type operator+(T arg) +{ + return arg; +} + +/// Negation. +/// \param arg operand +/// \return negated operand +template +HALF_CONSTEXPR typename enable::type operator-(T arg) +{ + return unary_specialized::negate(arg); +} + +/// \} +/// \name Input and output +/// \{ + +/// Output operator. +/// \param out output stream to write into +/// \param arg half expression to write +/// \return reference to output stream +template +typename enable&, T>::type operator<<(std::basic_ostream& out, T arg) +{ + return functions::write(out, arg); +} + +/// Input operator. +/// \param in input stream to read from +/// \param arg half to read into +/// \return reference to input stream +template +std::basic_istream& operator>>(std::basic_istream& in, half& arg) +{ + return functions::read(in, arg); +} + +/// \} +/// \name Basic mathematical operations +/// \{ + +/// Absolute value. +/// \param arg operand +/// \return absolute value of \a arg +// template typename enable::type abs(T arg) { return unary_specialized::fabs(arg); } +inline half abs(half arg) +{ + return unary_specialized::fabs(arg); +} +inline expr abs(expr arg) +{ + return unary_specialized::fabs(arg); +} + +/// Absolute value. +/// \param arg operand +/// \return absolute value of \a arg +// template typename enable::type fabs(T arg) { return unary_specialized::fabs(arg); } +inline half fabs(half arg) +{ + return unary_specialized::fabs(arg); +} +inline expr fabs(expr arg) +{ + return unary_specialized::fabs(arg); +} + +/// Remainder of division. +/// \param x first operand +/// \param y second operand +/// \return remainder of floating point division. +// template typename enable::type fmod(T x, U y) { return functions::fmod(x, y); } +inline expr fmod(half x, half y) +{ + return functions::fmod(x, y); +} +inline expr fmod(half x, expr y) +{ + return functions::fmod(x, y); +} +inline expr fmod(expr x, half y) +{ + return functions::fmod(x, y); +} +inline expr fmod(expr x, expr y) +{ + return functions::fmod(x, y); +} + +/// Remainder of division. +/// \param x first operand +/// \param y second operand +/// \return remainder of floating point division. +// template typename enable::type remainder(T x, U y) { return +// functions::remainder(x, y); } +inline expr remainder(half x, half y) +{ + return functions::remainder(x, y); +} +inline expr remainder(half x, expr y) +{ + return functions::remainder(x, y); +} +inline expr remainder(expr x, half y) +{ + return functions::remainder(x, y); +} +inline expr remainder(expr x, expr y) +{ + return functions::remainder(x, y); +} + +/// Remainder of division. +/// \param x first operand +/// \param y second operand +/// \param quo address to store some bits of quotient at +/// \return remainder of floating point division. 
+// template typename enable::type remquo(T x, U y, int *quo) { return +// functions::remquo(x, y, quo); } +inline expr remquo(half x, half y, int* quo) +{ + return functions::remquo(x, y, quo); +} +inline expr remquo(half x, expr y, int* quo) +{ + return functions::remquo(x, y, quo); +} +inline expr remquo(expr x, half y, int* quo) +{ + return functions::remquo(x, y, quo); +} +inline expr remquo(expr x, expr y, int* quo) +{ + return functions::remquo(x, y, quo); +} + +/// Fused multiply add. +/// \param x first operand +/// \param y second operand +/// \param z third operand +/// \return ( \a x * \a y ) + \a z rounded as one operation. +// template typename enable::type fma(T x, U y, V z) { return +// functions::fma(x, y, z); } +inline expr fma(half x, half y, half z) +{ + return functions::fma(x, y, z); +} +inline expr fma(half x, half y, expr z) +{ + return functions::fma(x, y, z); +} +inline expr fma(half x, expr y, half z) +{ + return functions::fma(x, y, z); +} +inline expr fma(half x, expr y, expr z) +{ + return functions::fma(x, y, z); +} +inline expr fma(expr x, half y, half z) +{ + return functions::fma(x, y, z); +} +inline expr fma(expr x, half y, expr z) +{ + return functions::fma(x, y, z); +} +inline expr fma(expr x, expr y, half z) +{ + return functions::fma(x, y, z); +} +inline expr fma(expr x, expr y, expr z) +{ + return functions::fma(x, y, z); +} + +/// Maximum of half expressions. +/// \param x first operand +/// \param y second operand +/// \return maximum of operands +// template typename result::type fmax(T x, U y) { return +// binary_specialized::fmax(x, y); } +inline half fmax(half x, half y) +{ + return binary_specialized::fmax(x, y); +} +inline expr fmax(half x, expr y) +{ + return binary_specialized::fmax(x, y); +} +inline expr fmax(expr x, half y) +{ + return binary_specialized::fmax(x, y); +} +inline expr fmax(expr x, expr y) +{ + return binary_specialized::fmax(x, y); +} + +/// Minimum of half expressions. +/// \param x first operand +/// \param y second operand +/// \return minimum of operands +// template typename result::type fmin(T x, U y) { return +// binary_specialized::fmin(x, y); } +inline half fmin(half x, half y) +{ + return binary_specialized::fmin(x, y); +} +inline expr fmin(half x, expr y) +{ + return binary_specialized::fmin(x, y); +} +inline expr fmin(expr x, half y) +{ + return binary_specialized::fmin(x, y); +} +inline expr fmin(expr x, expr y) +{ + return binary_specialized::fmin(x, y); +} + +/// Positive difference. +/// \param x first operand +/// \param y second operand +/// \return \a x - \a y or 0 if difference negative +// template typename enable::type fdim(T x, U y) { return functions::fdim(x, y); } +inline expr fdim(half x, half y) +{ + return functions::fdim(x, y); +} +inline expr fdim(half x, expr y) +{ + return functions::fdim(x, y); +} +inline expr fdim(expr x, half y) +{ + return functions::fdim(x, y); +} +inline expr fdim(expr x, expr y) +{ + return functions::fdim(x, y); +} + +/// Get NaN value. +/// \return quiet NaN +inline half nanh(const char*) +{ + return functions::nanh(); +} + +/// \} +/// \name Exponential functions +/// \{ + +/// Exponential function. +/// \param arg function argument +/// \return e raised to \a arg +// template typename enable::type exp(T arg) { return functions::exp(arg); } +inline expr exp(half arg) +{ + return functions::exp(arg); +} +inline expr exp(expr arg) +{ + return functions::exp(arg); +} + +/// Exponential minus one. 
+/// \param arg function argument +/// \return e raised to \a arg subtracted by 1 +// template typename enable::type expm1(T arg) { return functions::expm1(arg); } +inline expr expm1(half arg) +{ + return functions::expm1(arg); +} +inline expr expm1(expr arg) +{ + return functions::expm1(arg); +} + +/// Binary exponential. +/// \param arg function argument +/// \return 2 raised to \a arg +// template typename enable::type exp2(T arg) { return functions::exp2(arg); } +inline expr exp2(half arg) +{ + return functions::exp2(arg); +} +inline expr exp2(expr arg) +{ + return functions::exp2(arg); +} + +/// Natural logorithm. +/// \param arg function argument +/// \return logarithm of \a arg to base e +// template typename enable::type log(T arg) { return functions::log(arg); } +inline expr log(half arg) +{ + return functions::log(arg); +} +inline expr log(expr arg) +{ + return functions::log(arg); +} + +/// Common logorithm. +/// \param arg function argument +/// \return logarithm of \a arg to base 10 +// template typename enable::type log10(T arg) { return functions::log10(arg); } +inline expr log10(half arg) +{ + return functions::log10(arg); +} +inline expr log10(expr arg) +{ + return functions::log10(arg); +} + +/// Natural logorithm. +/// \param arg function argument +/// \return logarithm of \a arg plus 1 to base e +// template typename enable::type log1p(T arg) { return functions::log1p(arg); } +inline expr log1p(half arg) +{ + return functions::log1p(arg); +} +inline expr log1p(expr arg) +{ + return functions::log1p(arg); +} + +/// Binary logorithm. +/// \param arg function argument +/// \return logarithm of \a arg to base 2 +// template typename enable::type log2(T arg) { return functions::log2(arg); } +inline expr log2(half arg) +{ + return functions::log2(arg); +} +inline expr log2(expr arg) +{ + return functions::log2(arg); +} + +/// \} +/// \name Power functions +/// \{ + +/// Square root. +/// \param arg function argument +/// \return square root of \a arg +// template typename enable::type sqrt(T arg) { return functions::sqrt(arg); } +inline expr sqrt(half arg) +{ + return functions::sqrt(arg); +} +inline expr sqrt(expr arg) +{ + return functions::sqrt(arg); +} + +/// Cubic root. +/// \param arg function argument +/// \return cubic root of \a arg +// template typename enable::type cbrt(T arg) { return functions::cbrt(arg); } +inline expr cbrt(half arg) +{ + return functions::cbrt(arg); +} +inline expr cbrt(expr arg) +{ + return functions::cbrt(arg); +} + +/// Hypotenuse function. +/// \param x first argument +/// \param y second argument +/// \return square root of sum of squares without internal over- or underflows +// template typename enable::type hypot(T x, U y) { return functions::hypot(x, y); +//} +inline expr hypot(half x, half y) +{ + return functions::hypot(x, y); +} +inline expr hypot(half x, expr y) +{ + return functions::hypot(x, y); +} +inline expr hypot(expr x, half y) +{ + return functions::hypot(x, y); +} +inline expr hypot(expr x, expr y) +{ + return functions::hypot(x, y); +} + +/// Power function. 
+/// \param base first argument +/// \param exp second argument +/// \return \a base raised to \a exp +// template typename enable::type pow(T base, U exp) { return functions::pow(base, +// exp); } +inline expr pow(half base, half exp) +{ + return functions::pow(base, exp); +} +inline expr pow(half base, expr exp) +{ + return functions::pow(base, exp); +} +inline expr pow(expr base, half exp) +{ + return functions::pow(base, exp); +} +inline expr pow(expr base, expr exp) +{ + return functions::pow(base, exp); +} + +/// \} +/// \name Trigonometric functions +/// \{ + +/// Sine function. +/// \param arg function argument +/// \return sine value of \a arg +// template typename enable::type sin(T arg) { return functions::sin(arg); } +inline expr sin(half arg) +{ + return functions::sin(arg); +} +inline expr sin(expr arg) +{ + return functions::sin(arg); +} + +/// Cosine function. +/// \param arg function argument +/// \return cosine value of \a arg +// template typename enable::type cos(T arg) { return functions::cos(arg); } +inline expr cos(half arg) +{ + return functions::cos(arg); +} +inline expr cos(expr arg) +{ + return functions::cos(arg); +} + +/// Tangent function. +/// \param arg function argument +/// \return tangent value of \a arg +// template typename enable::type tan(T arg) { return functions::tan(arg); } +inline expr tan(half arg) +{ + return functions::tan(arg); +} +inline expr tan(expr arg) +{ + return functions::tan(arg); +} + +/// Arc sine. +/// \param arg function argument +/// \return arc sine value of \a arg +// template typename enable::type asin(T arg) { return functions::asin(arg); } +inline expr asin(half arg) +{ + return functions::asin(arg); +} +inline expr asin(expr arg) +{ + return functions::asin(arg); +} + +/// Arc cosine function. +/// \param arg function argument +/// \return arc cosine value of \a arg +// template typename enable::type acos(T arg) { return functions::acos(arg); } +inline expr acos(half arg) +{ + return functions::acos(arg); +} +inline expr acos(expr arg) +{ + return functions::acos(arg); +} + +/// Arc tangent function. +/// \param arg function argument +/// \return arc tangent value of \a arg +// template typename enable::type atan(T arg) { return functions::atan(arg); } +inline expr atan(half arg) +{ + return functions::atan(arg); +} +inline expr atan(expr arg) +{ + return functions::atan(arg); +} + +/// Arc tangent function. +/// \param x first argument +/// \param y second argument +/// \return arc tangent value +// template typename enable::type atan2(T x, U y) { return functions::atan2(x, y); +//} +inline expr atan2(half x, half y) +{ + return functions::atan2(x, y); +} +inline expr atan2(half x, expr y) +{ + return functions::atan2(x, y); +} +inline expr atan2(expr x, half y) +{ + return functions::atan2(x, y); +} +inline expr atan2(expr x, expr y) +{ + return functions::atan2(x, y); +} + +/// \} +/// \name Hyperbolic functions +/// \{ + +/// Hyperbolic sine. +/// \param arg function argument +/// \return hyperbolic sine value of \a arg +// template typename enable::type sinh(T arg) { return functions::sinh(arg); } +inline expr sinh(half arg) +{ + return functions::sinh(arg); +} +inline expr sinh(expr arg) +{ + return functions::sinh(arg); +} + +/// Hyperbolic cosine. 
+/// \param arg function argument +/// \return hyperbolic cosine value of \a arg +// template typename enable::type cosh(T arg) { return functions::cosh(arg); } +inline expr cosh(half arg) +{ + return functions::cosh(arg); +} +inline expr cosh(expr arg) +{ + return functions::cosh(arg); +} + +/// Hyperbolic tangent. +/// \param arg function argument +/// \return hyperbolic tangent value of \a arg +// template typename enable::type tanh(T arg) { return functions::tanh(arg); } +inline expr tanh(half arg) +{ + return functions::tanh(arg); +} +inline expr tanh(expr arg) +{ + return functions::tanh(arg); +} + +/// Hyperbolic area sine. +/// \param arg function argument +/// \return area sine value of \a arg +// template typename enable::type asinh(T arg) { return functions::asinh(arg); } +inline expr asinh(half arg) +{ + return functions::asinh(arg); +} +inline expr asinh(expr arg) +{ + return functions::asinh(arg); +} + +/// Hyperbolic area cosine. +/// \param arg function argument +/// \return area cosine value of \a arg +// template typename enable::type acosh(T arg) { return functions::acosh(arg); } +inline expr acosh(half arg) +{ + return functions::acosh(arg); +} +inline expr acosh(expr arg) +{ + return functions::acosh(arg); +} + +/// Hyperbolic area tangent. +/// \param arg function argument +/// \return area tangent value of \a arg +// template typename enable::type atanh(T arg) { return functions::atanh(arg); } +inline expr atanh(half arg) +{ + return functions::atanh(arg); +} +inline expr atanh(expr arg) +{ + return functions::atanh(arg); +} + +/// \} +/// \name Error and gamma functions +/// \{ + +/// Error function. +/// \param arg function argument +/// \return error function value of \a arg +// template typename enable::type erf(T arg) { return functions::erf(arg); } +inline expr erf(half arg) +{ + return functions::erf(arg); +} +inline expr erf(expr arg) +{ + return functions::erf(arg); +} + +/// Complementary error function. +/// \param arg function argument +/// \return 1 minus error function value of \a arg +// template typename enable::type erfc(T arg) { return functions::erfc(arg); } +inline expr erfc(half arg) +{ + return functions::erfc(arg); +} +inline expr erfc(expr arg) +{ + return functions::erfc(arg); +} + +/// Natural logarithm of gamma function. +/// \param arg function argument +/// \return natural logarith of gamma function for \a arg +// template typename enable::type lgamma(T arg) { return functions::lgamma(arg); } +inline expr lgamma(half arg) +{ + return functions::lgamma(arg); +} +inline expr lgamma(expr arg) +{ + return functions::lgamma(arg); +} + +/// Gamma function. +/// \param arg function argument +/// \return gamma function value of \a arg +// template typename enable::type tgamma(T arg) { return functions::tgamma(arg); } +inline expr tgamma(half arg) +{ + return functions::tgamma(arg); +} +inline expr tgamma(expr arg) +{ + return functions::tgamma(arg); +} + +/// \} +/// \name Rounding +/// \{ + +/// Nearest integer not less than half value. +/// \param arg half to round +/// \return nearest integer not less than \a arg +// template typename enable::type ceil(T arg) { return functions::ceil(arg); } +inline half ceil(half arg) +{ + return functions::ceil(arg); +} +inline half ceil(expr arg) +{ + return functions::ceil(arg); +} + +/// Nearest integer not greater than half value. 
+/// \param arg half to round +/// \return nearest integer not greater than \a arg +// template typename enable::type floor(T arg) { return functions::floor(arg); } +inline half floor(half arg) +{ + return functions::floor(arg); +} +inline half floor(expr arg) +{ + return functions::floor(arg); +} + +/// Nearest integer not greater in magnitude than half value. +/// \param arg half to round +/// \return nearest integer not greater in magnitude than \a arg +// template typename enable::type trunc(T arg) { return functions::trunc(arg); } +inline half trunc(half arg) +{ + return functions::trunc(arg); +} +inline half trunc(expr arg) +{ + return functions::trunc(arg); +} + +/// Nearest integer. +/// \param arg half to round +/// \return nearest integer, rounded away from zero in half-way cases +// template typename enable::type round(T arg) { return functions::round(arg); } +inline half round(half arg) +{ + return functions::round(arg); +} +inline half round(expr arg) +{ + return functions::round(arg); +} + +/// Nearest integer. +/// \param arg half to round +/// \return nearest integer, rounded away from zero in half-way cases +// template typename enable::type lround(T arg) { return functions::lround(arg); } +inline long lround(half arg) +{ + return functions::lround(arg); +} +inline long lround(expr arg) +{ + return functions::lround(arg); +} + +/// Nearest integer using half's internal rounding mode. +/// \param arg half expression to round +/// \return nearest integer using default rounding mode +// template typename enable::type nearbyint(T arg) { return functions::nearbyint(arg); } +inline half nearbyint(half arg) +{ + return functions::rint(arg); +} +inline half nearbyint(expr arg) +{ + return functions::rint(arg); +} + +/// Nearest integer using half's internal rounding mode. +/// \param arg half expression to round +/// \return nearest integer using default rounding mode +// template typename enable::type rint(T arg) { return functions::rint(arg); } +inline half rint(half arg) +{ + return functions::rint(arg); +} +inline half rint(expr arg) +{ + return functions::rint(arg); +} + +/// Nearest integer using half's internal rounding mode. +/// \param arg half expression to round +/// \return nearest integer using default rounding mode +// template typename enable::type lrint(T arg) { return functions::lrint(arg); } +inline long lrint(half arg) +{ + return functions::lrint(arg); +} +inline long lrint(expr arg) +{ + return functions::lrint(arg); +} +#if HALF_ENABLE_CPP11_LONG_LONG +/// Nearest integer. +/// \param arg half to round +/// \return nearest integer, rounded away from zero in half-way cases +// template typename enable::type llround(T arg) { return functions::llround(arg); } +inline long long llround(half arg) +{ + return functions::llround(arg); +} +inline long long llround(expr arg) +{ + return functions::llround(arg); +} + +/// Nearest integer using half's internal rounding mode. +/// \param arg half expression to round +/// \return nearest integer using default rounding mode +// template typename enable::type llrint(T arg) { return functions::llrint(arg); } +inline long long llrint(half arg) +{ + return functions::llrint(arg); +} +inline long long llrint(expr arg) +{ + return functions::llrint(arg); +} +#endif + +/// \} +/// \name Floating point manipulation +/// \{ + +/// Decompress floating point number. 
+/// \param arg number to decompress +/// \param exp address to store exponent at +/// \return significant in range [0.5, 1) +// template typename enable::type frexp(T arg, int *exp) { return functions::frexp(arg, exp); } +inline half frexp(half arg, int* exp) +{ + return functions::frexp(arg, exp); +} +inline half frexp(expr arg, int* exp) +{ + return functions::frexp(arg, exp); +} + +/// Multiply by power of two. +/// \param arg number to modify +/// \param exp power of two to multiply with +/// \return \a arg multplied by 2 raised to \a exp +// template typename enable::type ldexp(T arg, int exp) { return functions::scalbln(arg, exp); +//} +inline half ldexp(half arg, int exp) +{ + return functions::scalbln(arg, exp); +} +inline half ldexp(expr arg, int exp) +{ + return functions::scalbln(arg, exp); +} + +/// Extract integer and fractional parts. +/// \param arg number to decompress +/// \param iptr address to store integer part at +/// \return fractional part +// template typename enable::type modf(T arg, half *iptr) { return functions::modf(arg, iptr); +//} +inline half modf(half arg, half* iptr) +{ + return functions::modf(arg, iptr); +} +inline half modf(expr arg, half* iptr) +{ + return functions::modf(arg, iptr); +} + +/// Multiply by power of two. +/// \param arg number to modify +/// \param exp power of two to multiply with +/// \return \a arg multplied by 2 raised to \a exp +// template typename enable::type scalbn(T arg, int exp) { return functions::scalbln(arg, exp); +//} +inline half scalbn(half arg, int exp) +{ + return functions::scalbln(arg, exp); +} +inline half scalbn(expr arg, int exp) +{ + return functions::scalbln(arg, exp); +} + +/// Multiply by power of two. +/// \param arg number to modify +/// \param exp power of two to multiply with +/// \return \a arg multplied by 2 raised to \a exp +// template typename enable::type scalbln(T arg, long exp) { return functions::scalbln(arg, +// exp); +//} +inline half scalbln(half arg, long exp) +{ + return functions::scalbln(arg, exp); +} +inline half scalbln(expr arg, long exp) +{ + return functions::scalbln(arg, exp); +} + +/// Extract exponent. +/// \param arg number to query +/// \return floating point exponent +/// \retval FP_ILOGB0 for zero +/// \retval FP_ILOGBNAN for NaN +/// \retval MAX_INT for infinity +// template typename enable::type ilogb(T arg) { return functions::ilogb(arg); } +inline int ilogb(half arg) +{ + return functions::ilogb(arg); +} +inline int ilogb(expr arg) +{ + return functions::ilogb(arg); +} + +/// Extract exponent. +/// \param arg number to query +/// \return floating point exponent +// template typename enable::type logb(T arg) { return functions::logb(arg); } +inline half logb(half arg) +{ + return functions::logb(arg); +} +inline half logb(expr arg) +{ + return functions::logb(arg); +} + +/// Next representable value. 
+/// \param from value to compute next representable value for +/// \param to direction towards which to compute next value +/// \return next representable value after \a from in direction towards \a to +// template typename enable::type nextafter(T from, U to) { return +// functions::nextafter(from, to); } +inline half nextafter(half from, half to) +{ + return functions::nextafter(from, to); +} +inline half nextafter(half from, expr to) +{ + return functions::nextafter(from, to); +} +inline half nextafter(expr from, half to) +{ + return functions::nextafter(from, to); +} +inline half nextafter(expr from, expr to) +{ + return functions::nextafter(from, to); +} + +/// Next representable value. +/// \param from value to compute next representable value for +/// \param to direction towards which to compute next value +/// \return next representable value after \a from in direction towards \a to +// template typename enable::type nexttoward(T from, long double to) { return +// functions::nexttoward(from, to); } +inline half nexttoward(half from, long double to) +{ + return functions::nexttoward(from, to); +} +inline half nexttoward(expr from, long double to) +{ + return functions::nexttoward(from, to); +} + +/// Take sign. +/// \param x value to change sign for +/// \param y value to take sign from +/// \return value equal to \a x in magnitude and to \a y in sign +// template typename enable::type copysign(T x, U y) { return +// functions::copysign(x, y); } +inline half copysign(half x, half y) +{ + return functions::copysign(x, y); +} +inline half copysign(half x, expr y) +{ + return functions::copysign(x, y); +} +inline half copysign(expr x, half y) +{ + return functions::copysign(x, y); +} +inline half copysign(expr x, expr y) +{ + return functions::copysign(x, y); +} + +/// \} +/// \name Floating point classification +/// \{ + +/// Classify floating point value. +/// \param arg number to classify +/// \retval FP_ZERO for positive and negative zero +/// \retval FP_SUBNORMAL for subnormal numbers +/// \retval FP_INFINITY for positive and negative infinity +/// \retval FP_NAN for NaNs +/// \retval FP_NORMAL for all other (normal) values +// template typename enable::type fpclassify(T arg) { return functions::fpclassify(arg); } +inline int fpclassify(half arg) +{ + return functions::fpclassify(arg); +} +inline int fpclassify(expr arg) +{ + return functions::fpclassify(arg); +} + +/// Check if finite number. +/// \param arg number to check +/// \retval true if neither infinity nor NaN +/// \retval false else +// template typename enable::type isfinite(T arg) { return functions::isfinite(arg); } +inline bool isfinite(half arg) +{ + return functions::isfinite(arg); +} +inline bool isfinite(expr arg) +{ + return functions::isfinite(arg); +} + +/// Check for infinity. +/// \param arg number to check +/// \retval true for positive or negative infinity +/// \retval false else +// template typename enable::type isinf(T arg) { return functions::isinf(arg); } +inline bool isinf(half arg) +{ + return functions::isinf(arg); +} +inline bool isinf(expr arg) +{ + return functions::isinf(arg); +} + +/// Check for NaN. +/// \param arg number to check +/// \retval true for NaNs +/// \retval false else +// template typename enable::type isnan(T arg) { return functions::isnan(arg); } +inline bool isnan(half arg) +{ + return functions::isnan(arg); +} +inline bool isnan(expr arg) +{ + return functions::isnan(arg); +} + +/// Check if normal number. 
+
+/// Check if normal number.
+/// \param arg number to check
+/// \retval true if normal number
+/// \retval false if either subnormal, zero, infinity or NaN
+// template<typename T> typename enable<bool, T>::type isnormal(T arg) { return functions::isnormal(arg); }
+inline bool isnormal(half arg)
+{
+    return functions::isnormal(arg);
+}
+inline bool isnormal(expr arg)
+{
+    return functions::isnormal(arg);
+}
+
+/// Check sign.
+/// \param arg number to check
+/// \retval true for negative number
+/// \retval false for positive number
+// template<typename T> typename enable<bool, T>::type signbit(T arg) { return functions::signbit(arg); }
+inline bool signbit(half arg)
+{
+    return functions::signbit(arg);
+}
+inline bool signbit(expr arg)
+{
+    return functions::signbit(arg);
+}
+
+/// \}
+/// \name Comparison
+/// \{
+
+/// Comparison for greater than.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x greater than \a y
+/// \retval false else
+// template<typename T, typename U> typename enable<bool, T, U>::type isgreater(T x, U y) { return
+// functions::isgreater(x, y); }
+inline bool isgreater(half x, half y)
+{
+    return functions::isgreater(x, y);
+}
+inline bool isgreater(half x, expr y)
+{
+    return functions::isgreater(x, y);
+}
+inline bool isgreater(expr x, half y)
+{
+    return functions::isgreater(x, y);
+}
+inline bool isgreater(expr x, expr y)
+{
+    return functions::isgreater(x, y);
+}
+
+/// Comparison for greater equal.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x greater equal \a y
+/// \retval false else
+// template<typename T, typename U> typename enable<bool, T, U>::type isgreaterequal(T x, U y) { return
+// functions::isgreaterequal(x, y); }
+inline bool isgreaterequal(half x, half y)
+{
+    return functions::isgreaterequal(x, y);
+}
+inline bool isgreaterequal(half x, expr y)
+{
+    return functions::isgreaterequal(x, y);
+}
+inline bool isgreaterequal(expr x, half y)
+{
+    return functions::isgreaterequal(x, y);
+}
+inline bool isgreaterequal(expr x, expr y)
+{
+    return functions::isgreaterequal(x, y);
+}
+
+/// Comparison for less than.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x less than \a y
+/// \retval false else
+// template<typename T, typename U> typename enable<bool, T, U>::type isless(T x, U y) { return functions::isless(x,
+// y);
+//}
+inline bool isless(half x, half y)
+{
+    return functions::isless(x, y);
+}
+inline bool isless(half x, expr y)
+{
+    return functions::isless(x, y);
+}
+inline bool isless(expr x, half y)
+{
+    return functions::isless(x, y);
+}
+inline bool isless(expr x, expr y)
+{
+    return functions::isless(x, y);
+}
+
+/// Comparison for less equal.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x less equal \a y
+/// \retval false else
+// template<typename T, typename U> typename enable<bool, T, U>::type islessequal(T x, U y) { return
+// functions::islessequal(x, y); }
+inline bool islessequal(half x, half y)
+{
+    return functions::islessequal(x, y);
+}
+inline bool islessequal(half x, expr y)
+{
+    return functions::islessequal(x, y);
+}
+inline bool islessequal(expr x, half y)
+{
+    return functions::islessequal(x, y);
+}
+inline bool islessequal(expr x, expr y)
+{
+    return functions::islessequal(x, y);
+}
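+
+// Illustrative usage sketch (not part of the original header; the values
+// shown are hypothetical): these comparisons are quiet, i.e. they do not
+// raise floating-point exceptions even for NaN operands.
+//   half a(1.0f), b(2.0f);
+//   bool lt = isless(a, b); // true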
+
+/// Comparison for less or greater.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if either less or greater
+/// \retval false else
+// template<typename T, typename U> typename enable<bool, T, U>::type islessgreater(T x, U y) { return
+// functions::islessgreater(x, y); }
+inline bool islessgreater(half x, half y)
+{
+    return functions::islessgreater(x, y);
+}
+inline bool islessgreater(half x, expr y)
+{
+    return functions::islessgreater(x, y);
+}
+inline bool islessgreater(expr x, half y)
+{
+    return functions::islessgreater(x, y);
+}
+inline bool islessgreater(expr x, expr y)
+{
+    return functions::islessgreater(x, y);
+}
+
+/// Check if unordered.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if unordered (one or two NaN operands)
+/// \retval false else
+// template<typename T, typename U> typename enable<bool, T, U>::type isunordered(T x, U y) { return
+// functions::isunordered(x, y); }
+inline bool isunordered(half x, half y)
+{
+    return functions::isunordered(x, y);
+}
+inline bool isunordered(half x, expr y)
+{
+    return functions::isunordered(x, y);
+}
+inline bool isunordered(expr x, half y)
+{
+    return functions::isunordered(x, y);
+}
+inline bool isunordered(expr x, expr y)
+{
+    return functions::isunordered(x, y);
+}
+
+/// \}
+/// \name Casting
+/// \{
+
+/// Cast to or from half-precision floating point number.
+/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted
+/// directly using the given rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do.
+/// It uses the default rounding mode.
+///
+/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types
+/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler
+/// error and casting between [half](\ref half_float::half)s is just a no-op.
+/// \tparam T destination type (half or built-in arithmetic type)
+/// \tparam U source type (half or built-in arithmetic type)
+/// \param arg value to cast
+/// \return \a arg converted to destination type
+template <typename T, typename U>
+T half_cast(U arg)
+{
+    return half_caster<T, U>::cast(arg);
+}
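+
+// Illustrative usage sketch (not part of the original header; the values
+// shown are hypothetical): half_cast converts directly, with an optional
+// explicit rounding mode (see the overload below).
+//   half h = half_cast<half>(3.14159);                 // default rounding
+//   int  i = half_cast<int, std::round_to_nearest>(h); // explicit rounding mode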
+
+/// Cast to or from half-precision floating point number.
+/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted
+/// directly using the given rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do.
+///
+/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types
+/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler
+/// error and casting between [half](\ref half_float::half)s is just a no-op.
+/// \tparam T destination type (half or built-in arithmetic type)
+/// \tparam R rounding mode to use.
+/// \tparam U source type (half or built-in arithmetic type)
+/// \param arg value to cast
+/// \return \a arg converted to destination type
+template <typename T, std::float_round_style R, typename U>
+T half_cast(U arg)
+{
+    return half_caster<T, U, R>::cast(arg);
+}
+/// \}
+} // namespace detail
+
+using detail::operator==;
+using detail::operator!=;
+using detail::operator<;
+using detail::operator>;
+using detail::operator<=;
+using detail::operator>=;
+using detail::operator+;
+using detail::operator-;
+using detail::operator*;
+using detail::operator/;
+using detail::operator<<;
+using detail::operator>>;
+
+using detail::abs;
+using detail::acos;
+using detail::acosh;
+using detail::asin;
+using detail::asinh;
+using detail::atan;
+using detail::atan2;
+using detail::atanh;
+using detail::cbrt;
+using detail::ceil;
+using detail::cos;
+using detail::cosh;
+using detail::erf;
+using detail::erfc;
+using detail::exp;
+using detail::exp2;
+using detail::expm1;
+using detail::fabs;
+using detail::fdim;
+using detail::floor;
+using detail::fma;
+using detail::fmax;
+using detail::fmin;
+using detail::fmod;
+using detail::hypot;
+using detail::lgamma;
+using detail::log;
+using detail::log10;
+using detail::log1p;
+using detail::log2;
+using detail::lrint;
+using detail::lround;
+using detail::nanh;
+using detail::nearbyint;
+using detail::pow;
+using detail::remainder;
+using detail::remquo;
+using detail::rint;
+using detail::round;
+using detail::sin;
+using detail::sinh;
+using detail::sqrt;
+using detail::tan;
+using detail::tanh;
+using detail::tgamma;
+using detail::trunc;
+#if HALF_ENABLE_CPP11_LONG_LONG
+using detail::llrint;
+using detail::llround;
+#endif
+using detail::copysign;
+using detail::fpclassify;
+using detail::frexp;
+using detail::ilogb;
+using detail::isfinite;
+using detail::isgreater;
+using detail::isgreaterequal;
+using detail::isinf;
+using detail::isless;
+using detail::islessequal;
+using detail::islessgreater;
+using detail::isnan;
+using detail::isnormal;
+using detail::isunordered;
+using detail::ldexp;
+using detail::logb;
+using detail::modf;
+using detail::nextafter;
+using detail::nexttoward;
+using detail::scalbln;
+using detail::scalbn;
+using detail::signbit;
+
+using detail::half_cast;
+} // namespace half_float
+
+/// Extensions to the C++ standard library.
+namespace std
+{
+/// Numeric limits for half-precision floats.
+/// Because of the underlying single-precision implementation of many operations, it inherits some properties from
+/// `std::numeric_limits<float>`.
+template <>
+class numeric_limits<half_float::half> : public numeric_limits<float>
+{
+public:
+    /// Supports signed values.
+    static HALF_CONSTEXPR_CONST bool is_signed = true;
+
+    /// Is not exact.
+    static HALF_CONSTEXPR_CONST bool is_exact = false;
+
+    /// Doesn't provide modulo arithmetic.
+    static HALF_CONSTEXPR_CONST bool is_modulo = false;
+
+    /// IEEE conformant.
+    static HALF_CONSTEXPR_CONST bool is_iec559 = true;
+
+    /// Supports infinity.
+    static HALF_CONSTEXPR_CONST bool has_infinity = true;
+
+    /// Supports quiet NaNs.
+    static HALF_CONSTEXPR_CONST bool has_quiet_NaN = true;
+
+    /// Supports subnormal values.
+    static HALF_CONSTEXPR_CONST float_denorm_style has_denorm = denorm_present;
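+
+    // Illustrative usage sketch (not part of the original header):
+    //   std::numeric_limits<half_float::half>::epsilon(); // 2^-10, ~9.77e-4
+    //   std::numeric_limits<half_float::half>::max();     // 65504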
+
+    /// Rounding mode.
+    /// Due to the mix of internal single-precision computations (using the rounding mode of the underlying
+    /// single-precision implementation) with the rounding mode of the single-to-half conversions, the actual rounding
+    /// mode might be `std::round_indeterminate` if the default half-precision rounding mode doesn't match the
+    /// single-precision rounding mode.
+    static HALF_CONSTEXPR_CONST float_round_style round_style
+        = (std::numeric_limits<float>::round_style == half_float::half::round_style) ? half_float::half::round_style
+                                                                                     : round_indeterminate;
+
+    /// Significant digits.
+    static HALF_CONSTEXPR_CONST int digits = 11;
+
+    /// Significant decimal digits.
+    static HALF_CONSTEXPR_CONST int digits10 = 3;
+
+    /// Required decimal digits to represent all possible values.
+    static HALF_CONSTEXPR_CONST int max_digits10 = 5;
+
+    /// Number base.
+    static HALF_CONSTEXPR_CONST int radix = 2;
+
+    /// One more than smallest exponent.
+    static HALF_CONSTEXPR_CONST int min_exponent = -13;
+
+    /// Smallest normalized representable power of 10.
+    static HALF_CONSTEXPR_CONST int min_exponent10 = -4;
+
+    /// One more than largest exponent.
+    static HALF_CONSTEXPR_CONST int max_exponent = 16;
+
+    /// Largest finitely representable power of 10.
+    static HALF_CONSTEXPR_CONST int max_exponent10 = 4;
+
+    /// Smallest positive normal value.
+    static HALF_CONSTEXPR half_float::half min() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x0400);
+    }
+
+    /// Smallest finite value.
+    static HALF_CONSTEXPR half_float::half lowest() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0xFBFF);
+    }
+
+    /// Largest finite value.
+    static HALF_CONSTEXPR half_float::half max() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x7BFF);
+    }
+
+    /// Difference between one and next representable value.
+    static HALF_CONSTEXPR half_float::half epsilon() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x1400);
+    }
+
+    /// Maximum rounding error.
+    static HALF_CONSTEXPR half_float::half round_error() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, (round_style == std::round_to_nearest) ? 0x3800 : 0x3C00);
+    }
+
+    /// Positive infinity.
+    static HALF_CONSTEXPR half_float::half infinity() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x7C00);
+    }
+
+    /// Quiet NaN.
+    static HALF_CONSTEXPR half_float::half quiet_NaN() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x7FFF);
+    }
+
+    /// Signalling NaN.
+    static HALF_CONSTEXPR half_float::half signaling_NaN() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x7DFF);
+    }
+
+    /// Smallest positive subnormal value.
+    static HALF_CONSTEXPR half_float::half denorm_min() HALF_NOTHROW
+    {
+        return half_float::half(half_float::detail::binary, 0x0001);
+    }
+};
+
+#if HALF_ENABLE_CPP11_HASH
+/// Hash function for half-precision floats.
+/// This is only defined if C++11 `std::hash` is supported and enabled.
+template <>
+struct hash<half_float::half> //: unary_function<half_float::half, size_t>
+{
+    /// Type of function argument.
+    typedef half_float::half argument_type;
+
+    /// Function return type.
+    typedef size_t result_type;
+
+    /// Compute hash function.
+    /// \param arg half to hash
+    /// \return hash value
+    result_type operator()(argument_type arg) const
+    {
+        return hash<half_float::detail::uint16>()(static_cast<unsigned>(arg.data_) & -(arg.data_ != 0x8000));
+    }
+};
+#endif
+} // namespace std
+
+#undef HALF_CONSTEXPR
+#undef HALF_CONSTEXPR_CONST
+#undef HALF_NOEXCEPT
+#undef HALF_NOTHROW
+#ifdef HALF_POP_WARNINGS
+#pragma warning(pop)
+#undef HALF_POP_WARNINGS
+#endif
+
+#endif
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/logger.cpp b/src/Detector/tensorrt_yolo/common_deprecated/logger.cpp
new file mode 100644
index 00000000..03c64398
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/logger.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "logger.h"
+#include "ErrorRecorder.h"
+#include "logging.h"
+
+SampleErrorRecorder gRecorder;
+namespace sample
+{
+Logger gLogger{Logger::Severity::kINFO};
+LogStreamConsumer gLogVerbose{LOG_VERBOSE(gLogger)};
+LogStreamConsumer gLogInfo{LOG_INFO(gLogger)};
+LogStreamConsumer gLogWarning{LOG_WARN(gLogger)};
+LogStreamConsumer gLogError{LOG_ERROR(gLogger)};
+LogStreamConsumer gLogFatal{LOG_FATAL(gLogger)};
+
+void setReportableSeverity(Logger::Severity severity)
+{
+    gLogger.setReportableSeverity(severity);
+    gLogVerbose.setReportableSeverity(severity);
+    gLogInfo.setReportableSeverity(severity);
+    gLogWarning.setReportableSeverity(severity);
+    gLogError.setReportableSeverity(severity);
+    gLogFatal.setReportableSeverity(severity);
+}
+} // namespace sample
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/logger.h b/src/Detector/tensorrt_yolo/common_deprecated/logger.h
new file mode 100644
index 00000000..3069e8e9
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/logger.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LOGGER_H
+#define LOGGER_H
+
+#include "logging.h"
+
+class SampleErrorRecorder;
+extern SampleErrorRecorder gRecorder;
+namespace sample
+{
+extern Logger gLogger;
+extern LogStreamConsumer gLogVerbose;
+extern LogStreamConsumer gLogInfo;
+extern LogStreamConsumer gLogWarning;
+extern LogStreamConsumer gLogError;
+extern LogStreamConsumer gLogFatal;
+
+void setReportableSeverity(Logger::Severity severity);
+} // namespace sample
+
+#endif // LOGGER_H
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/logging.h b/src/Detector/tensorrt_yolo/common_deprecated/logging.h
new file mode 100644
index 00000000..78732c10
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/logging.h
@@ -0,0 +1,578 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TENSORRT_LOGGING_H
+#define TENSORRT_LOGGING_H
+
+#include "NvInferRuntimeCommon.h"
+#include "sampleOptions.h"
+#include <cassert>
+#include <ctime>
+#include <iomanip>
+#include <iostream>
+#include <mutex>
+#include <ostream>
+#include <sstream>
+#include <string>
+
+namespace sample
+{
+
+using Severity = nvinfer1::ILogger::Severity;
+
+class LogStreamConsumerBuffer : public std::stringbuf
+{
+public:
+    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
+        : mOutput(stream)
+        , mPrefix(prefix)
+        , mShouldLog(shouldLog)
+    {
+    }
+
+    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept
+        : mOutput(other.mOutput)
+        , mPrefix(other.mPrefix)
+        , mShouldLog(other.mShouldLog)
+    {
+    }
+    LogStreamConsumerBuffer(const LogStreamConsumerBuffer& other) = delete;
+    LogStreamConsumerBuffer() = delete;
+    LogStreamConsumerBuffer& operator=(const LogStreamConsumerBuffer&) = delete;
+    LogStreamConsumerBuffer& operator=(LogStreamConsumerBuffer&&) = delete;
+
+    ~LogStreamConsumerBuffer() override
+    {
+        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
+        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
+        // if the pointer to the beginning is not equal to the pointer to the current position,
+        // call putOutput() to log the output to the stream
+        if (pbase() != pptr())
+        {
+            putOutput();
+        }
+    }
+
+    //!
+    //! synchronizes the stream buffer and returns 0 on success
+    //! synchronizing the stream buffer consists of inserting the buffer contents into the stream,
+    //! resetting the buffer and flushing the stream
+    //!
+    int32_t sync() override
+    {
+        putOutput();
+        return 0;
+    }
+
+    void putOutput()
+    {
+        if (mShouldLog)
+        {
+            // prepend timestamp
+            std::time_t timestamp = std::time(nullptr);
+            tm* tm_local = std::localtime(&timestamp);
+            mOutput << "[";
+            mOutput << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
+            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
+            mOutput << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
+            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
+            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
+            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
+            // std::stringbuf::str() gets the string contents of the buffer
+            // insert the buffer contents pre-appended by the appropriate prefix into the stream
+            mOutput << mPrefix << str();
+        }
+        // set the buffer to empty
+        str("");
+        // flush the stream
+        mOutput.flush();
+    }
+
+    void setShouldLog(bool shouldLog)
+    {
+        mShouldLog = shouldLog;
+    }
+
+private:
+    std::ostream& mOutput;
+    std::string mPrefix;
+    bool mShouldLog{};
+}; // class LogStreamConsumerBuffer
+
+//!
+//! \class LogStreamConsumerBase
+//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
+//!
+class LogStreamConsumerBase
+{
+public:
+    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
+        : mBuffer(stream, prefix, shouldLog)
+    {
+    }
+
+protected:
+    std::mutex mLogMutex;
+    LogStreamConsumerBuffer mBuffer;
+}; // class LogStreamConsumerBase
+
+//!
+//! \class LogStreamConsumer
+//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
+//! Order of base classes is LogStreamConsumerBase and then std::ostream.
+//! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
+//! in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
+//! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
+//! Please do not change the order of the parent classes.
+//!
+class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
+{
+public:
+    //!
+    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
+    //! Reportable severity determines if the messages are severe enough to be logged.
+    //!
+    LogStreamConsumer(nvinfer1::ILogger::Severity reportableSeverity, nvinfer1::ILogger::Severity severity)
+        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
+        , std::ostream(&mBuffer) // links the stream buffer with the stream
+        , mShouldLog(severity <= reportableSeverity)
+        , mSeverity(severity)
+    {
+    }
+
+    LogStreamConsumer(LogStreamConsumer&& other) noexcept
+        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
+        , std::ostream(&mBuffer) // links the stream buffer with the stream
+        , mShouldLog(other.mShouldLog)
+        , mSeverity(other.mSeverity)
+    {
+    }
+    LogStreamConsumer(const LogStreamConsumer& other) = delete;
+    LogStreamConsumer() = delete;
+    ~LogStreamConsumer() = default;
+    LogStreamConsumer& operator=(const LogStreamConsumer&) = delete;
+    LogStreamConsumer& operator=(LogStreamConsumer&&) = delete;
+
+    void setReportableSeverity(Severity reportableSeverity)
+    {
+        mShouldLog = mSeverity <= reportableSeverity;
+        mBuffer.setShouldLog(mShouldLog);
+    }
+
+    std::mutex& getMutex()
+    {
+        return mLogMutex;
+    }
+
+    bool getShouldLog() const
+    {
+        return mShouldLog;
+    }
+
+private:
+    static std::ostream& severityOstream(Severity severity)
+    {
+        return severity >= Severity::kINFO ? std::cout : std::cerr;
+    }
+
+    static std::string severityPrefix(Severity severity)
+    {
+        switch (severity)
+        {
+        case Severity::kINTERNAL_ERROR: return "[F] ";
+        case Severity::kERROR: return "[E] ";
+        case Severity::kWARNING: return "[W] ";
+        case Severity::kINFO: return "[I] ";
+        case Severity::kVERBOSE: return "[V] ";
+        default: assert(0); return "";
+        }
+    }
+
+    bool mShouldLog;
+    Severity mSeverity;
+}; // class LogStreamConsumer
+
+template <typename T>
+LogStreamConsumer& operator<<(LogStreamConsumer& logger, const T& obj)
+{
+    if (logger.getShouldLog())
+    {
+        std::lock_guard<std::mutex> guard(logger.getMutex());
+        auto& os = static_cast<std::ostream&>(logger);
+        os << obj;
+    }
+    return logger;
+}
+
+//!
+//! Special handling std::endl
+//!
+inline LogStreamConsumer& operator<<(LogStreamConsumer& logger, std::ostream& (*f)(std::ostream&) )
+{
+    if (logger.getShouldLog())
+    {
+        std::lock_guard<std::mutex> guard(logger.getMutex());
+        auto& os = static_cast<std::ostream&>(logger);
+        os << f;
+    }
+    return logger;
+}
+
+inline LogStreamConsumer& operator<<(LogStreamConsumer& logger, const nvinfer1::Dims& dims)
+{
+    if (logger.getShouldLog())
+    {
+        std::lock_guard<std::mutex> guard(logger.getMutex());
+        auto& os = static_cast<std::ostream&>(logger);
+        for (int32_t i = 0; i < dims.nbDims; ++i)
+        {
+            os << (i ? "x" : "") << dims.d[i];
+        }
+    }
+    return logger;
+}
+
+//!
+//! \class Logger
+//!
+//! \brief Class which manages logging of TensorRT tools and samples
+//!
+//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
+//! and supports logging two types of messages:
+//!
+//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
+//! - Test pass/fail messages
+//!
+//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
+//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
+//!
+//! In the future, this class could be extended to support dumping test results to a file in some standard format
+//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
+//!
+//!
TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger +//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT +//! library and messages coming from the sample. +//! +//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the +//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger +//! object. +//! +class Logger : public nvinfer1::ILogger +{ +public: + explicit Logger(Severity severity = Severity::kWARNING) + : mReportableSeverity(severity) + { + } + + //! + //! \enum TestResult + //! \brief Represents the state of a given test + //! + enum class TestResult + { + kRUNNING, //!< The test is running + kPASSED, //!< The test passed + kFAILED, //!< The test failed + kWAIVED //!< The test was waived + }; + + //! + //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger + //! \return The nvinfer1::ILogger associated with this Logger + //! + //! TODO Once all samples are updated to use this method to register the logger with TensorRT, + //! we can eliminate the inheritance of Logger from ILogger + //! + nvinfer1::ILogger& getTRTLogger() noexcept + { + return *this; + } + + //! + //! \brief Implementation of the nvinfer1::ILogger::log() virtual method + //! + //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the + //! inheritance from nvinfer1::ILogger + //! + void log(Severity severity, const char* msg) noexcept override + { + LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; + } + + //! + //! \brief Method for controlling the verbosity of logging output + //! + //! \param severity The logger will only emit messages that have severity of this level or higher. + //! + void setReportableSeverity(Severity severity) noexcept + { + mReportableSeverity = severity; + } + + //! + //! \brief Opaque handle that holds logging information for a particular test + //! + //! This object is an opaque handle to information used by the Logger to print test results. + //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used + //! with Logger::reportTest{Start,End}(). + //! + class TestAtom + { + public: + TestAtom(TestAtom&&) = default; + + private: + friend class Logger; + + TestAtom(bool started, const std::string& name, const std::string& cmdline) + : mStarted(started) + , mName(name) + , mCmdline(cmdline) + { + } + + bool mStarted; + std::string mName; + std::string mCmdline; + }; + + //! + //! \brief Define a test for logging + //! + //! \param[in] name The name of the test. This should be a string starting with + //! "TensorRT" and containing dot-separated strings containing + //! the characters [A-Za-z0-9_]. + //! For example, "TensorRT.sample_googlenet" + //! \param[in] cmdline The command line used to reproduce the test + // + //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). + //! + static TestAtom defineTest(const std::string& name, const std::string& cmdline) + { + return TestAtom(false, name, cmdline); + } + + //! + //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments + //! as input + //! + //! \param[in] name The name of the test + //! \param[in] argc The number of command-line arguments + //! 
\param[in] argv The array of command-line arguments (given as C strings) + //! + //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). + //! + static TestAtom defineTest(const std::string& name, int32_t argc, char const* const* argv) + { + // Append TensorRT version as info + const std::string vname = name + " [TensorRT v" + std::to_string(NV_TENSORRT_VERSION) + "]"; + auto cmdline = genCmdlineString(argc, argv); + return defineTest(vname, cmdline); + } + + //! + //! \brief Report that a test has started. + //! + //! \pre reportTestStart() has not been called yet for the given testAtom + //! + //! \param[in] testAtom The handle to the test that has started + //! + static void reportTestStart(TestAtom& testAtom) + { + reportTestResult(testAtom, TestResult::kRUNNING); + assert(!testAtom.mStarted); + testAtom.mStarted = true; + } + + //! + //! \brief Report that a test has ended. + //! + //! \pre reportTestStart() has been called for the given testAtom + //! + //! \param[in] testAtom The handle to the test that has ended + //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, + //! TestResult::kFAILED, TestResult::kWAIVED + //! + static void reportTestEnd(TestAtom const& testAtom, TestResult result) + { + assert(result != TestResult::kRUNNING); + assert(testAtom.mStarted); + reportTestResult(testAtom, result); + } + + static int32_t reportPass(TestAtom const& testAtom) + { + reportTestEnd(testAtom, TestResult::kPASSED); + return EXIT_SUCCESS; + } + + static int32_t reportFail(TestAtom const& testAtom) + { + reportTestEnd(testAtom, TestResult::kFAILED); + return EXIT_FAILURE; + } + + static int32_t reportWaive(TestAtom const& testAtom) + { + reportTestEnd(testAtom, TestResult::kWAIVED); + return EXIT_SUCCESS; + } + + static int32_t reportTest(TestAtom const& testAtom, bool pass) + { + return pass ? reportPass(testAtom) : reportFail(testAtom); + } + + Severity getReportableSeverity() const + { + return mReportableSeverity; + } + +private: + //! + //! \brief returns an appropriate string for prefixing a log message with the given severity + //! + static const char* severityPrefix(Severity severity) + { + switch (severity) + { + case Severity::kINTERNAL_ERROR: return "[F] "; + case Severity::kERROR: return "[E] "; + case Severity::kWARNING: return "[W] "; + case Severity::kINFO: return "[I] "; + case Severity::kVERBOSE: return "[V] "; + default: assert(0); return ""; + } + } + + //! + //! \brief returns an appropriate string for prefixing a test result message with the given result + //! + static const char* testResultString(TestResult result) + { + switch (result) + { + case TestResult::kRUNNING: return "RUNNING"; + case TestResult::kPASSED: return "PASSED"; + case TestResult::kFAILED: return "FAILED"; + case TestResult::kWAIVED: return "WAIVED"; + default: assert(0); return ""; + } + } + + //! + //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity + //! + static std::ostream& severityOstream(Severity severity) + { + return severity >= Severity::kINFO ? std::cout : std::cerr; + } + + //! + //! \brief method that implements logging test results + //! + static void reportTestResult(TestAtom const& testAtom, TestResult result) + { + severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " + << testAtom.mCmdline << std::endl; + } + + //! + //! \brief generate a command line string from the given (argc, argv) values + //! 
+ static std::string genCmdlineString(int32_t argc, char const* const* argv) + { + std::stringstream ss; + for (int32_t i = 0; i < argc; i++) + { + if (i > 0) + { + ss << " "; + } + ss << argv[i]; + } + return ss.str(); + } + + Severity mReportableSeverity; +}; // class Logger + +namespace +{ +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE +//! +//! Example usage: +//! +//! LOG_VERBOSE(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO +//! +//! Example usage: +//! +//! LOG_INFO(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_INFO(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING +//! +//! Example usage: +//! +//! LOG_WARN(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_WARN(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR +//! +//! Example usage: +//! +//! LOG_ERROR(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_ERROR(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR +//! ("fatal" severity) +//! +//! Example usage: +//! +//! LOG_FATAL(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_FATAL(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); +} +} // anonymous namespace +} // namespace sample +#endif // TENSORRT_LOGGING_H diff --git a/src/Detector/tensorrt_yolo/common_deprecated/parserOnnxConfig.h b/src/Detector/tensorrt_yolo/common_deprecated/parserOnnxConfig.h new file mode 100644 index 00000000..c92a1420 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common_deprecated/parserOnnxConfig.h @@ -0,0 +1,153 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef PARSER_ONNX_CONFIG_H
+#define PARSER_ONNX_CONFIG_H
+
+#include <cstdlib>
+#include <iostream>
+#include <string>
+
+#include "NvInfer.h"
+#include "NvOnnxConfig.h"
+#include "NvOnnxParser.h"
+
+#define ONNX_DEBUG 1
+
+/**
+ * \class ParserOnnxConfig
+ * \brief Configuration Manager Class Concrete Implementation
+ *
+ * \note:
+ *
+ */
+
+using namespace std;
+
+class ParserOnnxConfig : public nvonnxparser::IOnnxConfig
+{
+
+protected:
+    string mModelFilename{};
+    string mTextFilename{};
+    string mFullTextFilename{};
+    nvinfer1::DataType mModelDtype;
+    nvonnxparser::IOnnxConfig::Verbosity mVerbosity;
+    bool mPrintLayercInfo;
+
+public:
+    ParserOnnxConfig()
+        : mModelDtype(nvinfer1::DataType::kFLOAT)
+        , mVerbosity(static_cast<int>(nvinfer1::ILogger::Severity::kWARNING))
+        , mPrintLayercInfo(false)
+    {
+#ifdef ONNX_DEBUG
+        if (isDebug())
+        {
+            std::cout << " ParserOnnxConfig::ctor(): " << this << "\t" << std::endl;
+        }
+#endif
+    }
+
+protected:
+    ~ParserOnnxConfig()
+    {
+#ifdef ONNX_DEBUG
+        if (isDebug())
+        {
+            std::cout << "ParserOnnxConfig::dtor(): " << this << std::endl;
+        }
+#endif
+    }
+
+public:
+    virtual void setModelDtype(const nvinfer1::DataType modelDtype) noexcept
+    {
+        mModelDtype = modelDtype;
+    }
+
+    virtual nvinfer1::DataType getModelDtype() const noexcept
+    {
+        return mModelDtype;
+    }
+
+    virtual const char* getModelFileName() const noexcept
+    {
+        return mModelFilename.c_str();
+    }
+    virtual void setModelFileName(const char* onnxFilename) noexcept
+    {
+        mModelFilename = string(onnxFilename);
+    }
+    virtual nvonnxparser::IOnnxConfig::Verbosity getVerbosityLevel() const noexcept
+    {
+        return mVerbosity;
+    }
+    virtual void addVerbosity() noexcept
+    {
+        ++mVerbosity;
+    }
+    virtual void reduceVerbosity() noexcept
+    {
+        --mVerbosity;
+    }
+    virtual void setVerbosityLevel(nvonnxparser::IOnnxConfig::Verbosity verbosity) noexcept
+    {
+        mVerbosity = verbosity;
+    }
+
+    virtual const char* getTextFileName() const noexcept
+    {
+        return mTextFilename.c_str();
+    }
+    virtual void setTextFileName(const char* textFilename) noexcept
+    {
+        mTextFilename = string(textFilename);
+    }
+    virtual const char* getFullTextFileName() const noexcept
+    {
+        return mFullTextFilename.c_str();
+    }
+    virtual void setFullTextFileName(const char* fullTextFilename) noexcept
+    {
+        mFullTextFilename = string(fullTextFilename);
+    }
+    virtual bool getPrintLayerInfo() const noexcept
+    {
+        return mPrintLayercInfo;
+    }
+    virtual void setPrintLayerInfo(bool src) noexcept
+    {
+        mPrintLayercInfo = src;
+    } //!< set the boolean variable corresponding to the Layer Info, see getPrintLayerInfo()
+
+    virtual bool isDebug() const noexcept
+    {
+#if ONNX_DEBUG
+        return (std::getenv("ONNX_DEBUG") ? true : false);
+#else
+        return false;
+#endif
+    }
+
+    virtual void destroy() noexcept
+    {
+        delete this;
+    }
+
+}; // class ParserOnnxConfig
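+
+// Illustrative usage sketch (not part of the original header; the file name
+// is hypothetical). The destructor is protected, so instances are released
+// through destroy():
+//   auto* cfg = new ParserOnnxConfig;
+//   cfg->setModelFileName("model.onnx");
+//   cfg->destroy();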
+
+#endif
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/safeCommon.h b/src/Detector/tensorrt_yolo/common_deprecated/safeCommon.h
new file mode 100644
index 00000000..3d84b095
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/safeCommon.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TENSORRT_SAFE_COMMON_H
+#define TENSORRT_SAFE_COMMON_H
+
+#include "NvInferRuntimeCommon.h"
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <stdexcept>
+
+#define CHECK(status)                                              \
+    do                                                             \
+    {                                                              \
+        auto ret = (status);                                       \
+        if (ret != 0)                                              \
+        {                                                          \
+            std::cerr << "Cuda failure: " << ret << std::endl;     \
+            abort();                                               \
+        }                                                          \
+    } while (0)
+
+namespace samplesCommon
+{
+template <typename T>
+inline std::shared_ptr<T> infer_object(T* obj)
+{
+    if (!obj)
+    {
+        throw std::runtime_error("Failed to create object");
+    }
+    return std::shared_ptr<T>(obj);
+}
+
+inline uint32_t elementSize(nvinfer1::DataType t)
+{
+    switch (t)
+    {
+    case nvinfer1::DataType::kINT32:
+    case nvinfer1::DataType::kFLOAT: return 4;
+    case nvinfer1::DataType::kHALF: return 2;
+    case nvinfer1::DataType::kINT8: return 1;
+    case nvinfer1::DataType::kBOOL: return 1;
+    }
+    return 0;
+}
+
+template <typename A, typename B>
+inline A divUp(A x, B n)
+{
+    return (x + n - 1) / n;
+}
+
+} // namespace samplesCommon
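+
+// Illustrative usage sketch (not part of the original header; the values
+// shown are hypothetical):
+//   samplesCommon::divUp(10, 3);                           // == 4, i.e. ceil(10/3)
+//   samplesCommon::elementSize(nvinfer1::DataType::kHALF); // == 2 (bytes)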
+
+#endif // TENSORRT_SAFE_COMMON_H
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleConfig.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleConfig.h
new file mode 100644
index 00000000..53a78331
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleConfig.h
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SampleConfig_H
+#define SampleConfig_H
+
+#include <cstdlib>
+#include <iostream>
+#include <string>
+
+#include "NvInfer.h"
+#include "NvOnnxConfig.h"
+class SampleConfig : public nvonnxparser::IOnnxConfig
+{
+public:
+    enum class InputDataFormat : int
+    {
+        kASCII = 0,
+        kPPM = 1
+    };
+
+private:
+    std::string mModelFilename;
+    std::string mEngineFilename;
+    std::string mTextFilename;
+    std::string mFullTextFilename;
+    std::string mImageFilename;
+    std::string mReferenceFilename;
+    std::string mOutputFilename;
+    std::string mCalibrationFilename;
+    std::string mTimingCacheFilename;
+    int64_t mLabel{-1};
+    int64_t mMaxBatchSize{32};
+    int64_t mCalibBatchSize{0};
+    int64_t mMaxNCalibBatch{0};
+    int64_t mFirstCalibBatch{0};
+    int64_t mUseDLACore{-1};
+    nvinfer1::DataType mModelDtype{nvinfer1::DataType::kFLOAT};
+    bool mTF32{true};
+    Verbosity mVerbosity{static_cast<int>(nvinfer1::ILogger::Severity::kWARNING)};
+    bool mPrintLayercInfo{false};
+    bool mDebugBuilder{false};
+    InputDataFormat mInputDataFormat{InputDataFormat::kASCII};
+    uint64_t mTopK{0};
+    float mFailurePercentage{-1.0f};
+    float mTolerance{0.0f};
+    float mAbsTolerance{1e-5f};
+
+public:
+    SampleConfig()
+    {
+#ifdef ONNX_DEBUG
+        if (isDebug())
+        {
+            std::cout << " SampleConfig::ctor(): " << this << "\t" << std::endl;
+        }
+#endif
+    }
+
+protected:
+    ~SampleConfig()
+    {
+#ifdef ONNX_DEBUG
+        if (isDebug())
+        {
+            std::cout << "SampleConfig::dtor(): " << this << std::endl;
+        }
+#endif
+    }
+
+public:
+    void setModelDtype(const nvinfer1::DataType mdt) noexcept
+    {
+        mModelDtype = mdt;
+    }
+
+    nvinfer1::DataType getModelDtype() const noexcept
+    {
+        return mModelDtype;
+    }
+
+    bool getTF32() const noexcept
+    {
+        return mTF32;
+    }
+
+    void setTF32(bool enabled) noexcept
+    {
+        mTF32 = enabled;
+    }
+
+    const char* getModelFileName() const noexcept
+    {
+        return mModelFilename.c_str();
+    }
+
+    void setModelFileName(const char* onnxFilename) noexcept
+    {
+        mModelFilename = std::string(onnxFilename);
+    }
+    Verbosity getVerbosityLevel() const noexcept
+    {
+        return mVerbosity;
+    }
+    void addVerbosity() noexcept
+    {
+        ++mVerbosity;
+    }
+    void reduceVerbosity() noexcept
+    {
+        --mVerbosity;
+    }
+    virtual void setVerbosityLevel(Verbosity v) noexcept
+    {
+        mVerbosity = v;
+    }
+    const char* getEngineFileName() const noexcept
+    {
+        return mEngineFilename.c_str();
+    }
+    void setEngineFileName(const char* engineFilename) noexcept
+    {
+        mEngineFilename = std::string(engineFilename);
+    }
+    const char* getTextFileName() const noexcept
+    {
+        return mTextFilename.c_str();
+    }
+    void setTextFileName(const char* textFilename) noexcept
+    {
+        mTextFilename = std::string(textFilename);
+    }
+    const char* getFullTextFileName() const noexcept
+    {
+        return mFullTextFilename.c_str();
+    }
+    void setFullTextFileName(const char* fullTextFilename) noexcept
+    {
+        mFullTextFilename = std::string(fullTextFilename);
+    }
+    void setLabel(int64_t label) noexcept
+    {
+        mLabel = label;
+    } //!< set the Label
+
+    int64_t getLabel() const noexcept
+    {
+        return mLabel;
+    } //!< get the Label
+
+    bool getPrintLayerInfo() const noexcept
+    {
+        return mPrintLayercInfo;
+    }
+
+    void setPrintLayerInfo(bool b) noexcept
+    {
+        mPrintLayercInfo = b;
+    } //!< set the boolean variable corresponding to the Layer Info, see getPrintLayerInfo()
+
+    void setMaxBatchSize(int64_t maxBatchSize) noexcept
+    {
+        mMaxBatchSize = maxBatchSize;
+    } //!< set the Max Batch Size
+    int64_t getMaxBatchSize() const noexcept
+    {
+        return mMaxBatchSize;
+    } //!< get the Max Batch Size
+
+    void setCalibBatchSize(int64_t CalibBatchSize)
noexcept + { + mCalibBatchSize = CalibBatchSize; + } //!< set the calibration batch size + int64_t getCalibBatchSize() const noexcept + { + return mCalibBatchSize; + } //!< get calibration batch size + + void setMaxNCalibBatch(int64_t MaxNCalibBatch) noexcept + { + mMaxNCalibBatch = MaxNCalibBatch; + } //!< set Max Number of Calibration Batches + int64_t getMaxNCalibBatch() const noexcept + { + return mMaxNCalibBatch; + } //!< get the Max Number of Calibration Batches + + void setFirstCalibBatch(int64_t FirstCalibBatch) noexcept + { + mFirstCalibBatch = FirstCalibBatch; + } //!< set the first calibration batch + int64_t getFirstCalibBatch() const noexcept + { + return mFirstCalibBatch; + } //!< get the first calibration batch + + void setUseDLACore(int64_t UseDLACore) noexcept + { + mUseDLACore = UseDLACore; + } //!< set the DLA core to use + int64_t getUseDLACore() const noexcept + { + return mUseDLACore; + } //!< get the DLA core to use + + void setDebugBuilder() noexcept + { + mDebugBuilder = true; + } //!< enable the Debug info, while building the engine. + bool getDebugBuilder() const noexcept + { + return mDebugBuilder; + } //!< get the boolean variable, corresponding to the debug builder + + const char* getImageFileName() const noexcept //!< set Image file name (PPM or ASCII) + { + return mImageFilename.c_str(); + } + void setImageFileName(const char* imageFilename) noexcept //!< get the Image file name + { + mImageFilename = std::string(imageFilename); + } + const char* getReferenceFileName() const noexcept + { + return mReferenceFilename.c_str(); + } + void setReferenceFileName(const char* referenceFilename) noexcept //!< set reference file name + { + mReferenceFilename = std::string(referenceFilename); + } + + void setInputDataFormat(InputDataFormat idt) noexcept + { + mInputDataFormat = idt; + } //!< specifies expected data format of the image file (PPM or ASCII) + InputDataFormat getInputDataFormat() const noexcept + { + return mInputDataFormat; + } //!< returns the expected data format of the image file. + + const char* getOutputFileName() const noexcept //!< specifies the file to save the results + { + return mOutputFilename.c_str(); + } + void setOutputFileName(const char* outputFilename) noexcept //!< get the output file name + { + mOutputFilename = std::string(outputFilename); + } + + const char* getCalibrationFileName() const noexcept + { + return mCalibrationFilename.c_str(); + } //!< specifies the file containing the list of image files for int8 calibration + void setCalibrationFileName(const char* calibrationFilename) noexcept //!< get the int 8 calibration list file name + { + mCalibrationFilename = std::string(calibrationFilename); + } + + uint64_t getTopK() const noexcept + { + return mTopK; + } + void setTopK(uint64_t topK) noexcept + { + mTopK = topK; + } //!< If this options is specified, return the K top probabilities. 
+
+    float getFailurePercentage() const noexcept
+    {
+        return mFailurePercentage;
+    }
+
+    void setFailurePercentage(float f) noexcept
+    {
+        mFailurePercentage = f;
+    }
+
+    float getAbsoluteTolerance() const noexcept
+    {
+        return mAbsTolerance;
+    }
+
+    void setAbsoluteTolerance(float a) noexcept
+    {
+        mAbsTolerance = a;
+    }
+
+    float getTolerance() const noexcept
+    {
+        return mTolerance;
+    }
+
+    void setTolerance(float t) noexcept
+    {
+        mTolerance = t;
+    }
+
+    const char* getTimingCacheFilename() const noexcept
+    {
+        return mTimingCacheFilename.c_str();
+    }
+
+    void setTimingCacheFileName(const char* timingCacheFilename) noexcept
+    {
+        mTimingCacheFilename = std::string(timingCacheFilename);
+    }
+
+    bool isDebug() const noexcept
+    {
+#if ONNX_DEBUG
+        return (std::getenv("ONNX_DEBUG") ? true : false);
+#else
+        return false;
+#endif
+    }
+
+    void destroy() noexcept
+    {
+        delete this;
+    }
+
+}; // class SampleConfig
+
+#endif
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleDevice.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleDevice.h
new file mode 100644
index 00000000..2053ac7c
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleDevice.h
@@ -0,0 +1,494 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_SAMPLE_DEVICE_H
+#define TRT_SAMPLE_DEVICE_H
+
+#include <cassert>
+#include <chrono>
+#include <cuda_runtime_api.h>
+#include <iostream>
+#include <thread>
+
+namespace sample
+{
+
+inline void cudaCheck(cudaError_t ret, std::ostream& err = std::cerr)
+{
+    if (ret != cudaSuccess)
+    {
+        err << "Cuda failure: " << cudaGetErrorString(ret) << std::endl;
+        abort();
+    }
+}
+
+class TrtCudaEvent;
+
+namespace
+{
+
+void cudaSleep(void* sleep)
+{
+    std::this_thread::sleep_for(std::chrono::duration<float, std::milli>(*static_cast<float*>(sleep)));
+}
+
+} // namespace
+
+//!
+//! \class TrtCudaStream
+//! \brief Managed CUDA stream
+//!
+class TrtCudaStream
+{
+public:
+    TrtCudaStream()
+    {
+        cudaCheck(cudaStreamCreate(&mStream));
+    }
+
+    TrtCudaStream(const TrtCudaStream&) = delete;
+
+    TrtCudaStream& operator=(const TrtCudaStream&) = delete;
+
+    TrtCudaStream(TrtCudaStream&&) = delete;
+
+    TrtCudaStream& operator=(TrtCudaStream&&) = delete;
+
+    ~TrtCudaStream()
+    {
+        cudaCheck(cudaStreamDestroy(mStream));
+    }
+
+    cudaStream_t get() const
+    {
+        return mStream;
+    }
+
+    void synchronize()
+    {
+        cudaCheck(cudaStreamSynchronize(mStream));
+    }
+
+    void wait(TrtCudaEvent& event);
+
+    void sleep(float* ms)
+    {
+        cudaCheck(cudaLaunchHostFunc(mStream, cudaSleep, ms));
+    }
+
+private:
+    cudaStream_t mStream{};
+};
+
+//!
+//! \class TrtCudaEvent
+//! \brief Managed CUDA event
+//!
+class TrtCudaEvent
+{
+public:
+    explicit TrtCudaEvent(bool blocking = true)
+    {
+        const uint32_t flags = blocking ?
cudaEventBlockingSync : cudaEventDefault; + cudaCheck(cudaEventCreateWithFlags(&mEvent, flags)); + } + + TrtCudaEvent(const TrtCudaEvent&) = delete; + + TrtCudaEvent& operator=(const TrtCudaEvent&) = delete; + + TrtCudaEvent(TrtCudaEvent&&) = delete; + + TrtCudaEvent& operator=(TrtCudaEvent&&) = delete; + + ~TrtCudaEvent() + { + cudaCheck(cudaEventDestroy(mEvent)); + } + + cudaEvent_t get() const + { + return mEvent; + } + + void record(const TrtCudaStream& stream) + { + cudaCheck(cudaEventRecord(mEvent, stream.get())); + } + + void synchronize() + { + cudaCheck(cudaEventSynchronize(mEvent)); + } + + // Returns time elapsed time in milliseconds + float operator-(const TrtCudaEvent& e) const + { + float time{0}; + cudaCheck(cudaEventElapsedTime(&time, e.get(), get())); + return time; + } + +private: + cudaEvent_t mEvent{}; +}; + +inline void TrtCudaStream::wait(TrtCudaEvent& event) +{ + cudaCheck(cudaStreamWaitEvent(mStream, event.get(), 0)); +} + +//! +//! \class TrtCudaGraph +//! \brief Managed CUDA graph +//! +class TrtCudaGraph +{ +public: + explicit TrtCudaGraph() = default; + + TrtCudaGraph(const TrtCudaGraph&) = delete; + + TrtCudaGraph& operator=(const TrtCudaGraph&) = delete; + + TrtCudaGraph(TrtCudaGraph&&) = delete; + + TrtCudaGraph& operator=(TrtCudaGraph&&) = delete; + + ~TrtCudaGraph() + { + if (mGraphExec) + { + cudaGraphExecDestroy(mGraphExec); + } + } + + void beginCapture(TrtCudaStream& stream) + { + cudaCheck(cudaStreamBeginCapture(stream.get(), cudaStreamCaptureModeThreadLocal)); + } + + bool launch(TrtCudaStream& stream) + { + return cudaGraphLaunch(mGraphExec, stream.get()) == cudaSuccess; + } + + void endCapture(TrtCudaStream& stream) + { + cudaCheck(cudaStreamEndCapture(stream.get(), &mGraph)); + cudaCheck(cudaGraphInstantiate(&mGraphExec, mGraph, nullptr, nullptr, 0)); + cudaCheck(cudaGraphDestroy(mGraph)); + } + + void endCaptureOnError(TrtCudaStream& stream) + { + // There are two possibilities why stream capture would fail: + // (1) stream is in cudaErrorStreamCaptureInvalidated state. + // (2) TRT reports a failure. + // In case (1), the returning mGraph should be nullptr. + // In case (2), the returning mGraph is not nullptr, but it should not be used. + const auto ret = cudaStreamEndCapture(stream.get(), &mGraph); + if (ret == cudaErrorStreamCaptureInvalidated) + { + assert(mGraph == nullptr); + } + else + { + assert(ret == cudaSuccess); + assert(mGraph != nullptr); + cudaCheck(cudaGraphDestroy(mGraph)); + mGraph = nullptr; + } + // Clean up any CUDA error. + cudaGetLastError(); + sample::gLogWarning << "The CUDA graph capture on the stream has failed." << std::endl; + } + +private: + cudaGraph_t mGraph{}; + cudaGraphExec_t mGraphExec{}; +}; + +//! +//! \class TrtCudaBuffer +//! \brief Managed buffer for host and device +//! 
+template <typename A, typename D>
+class TrtCudaBuffer
+{
+public:
+    TrtCudaBuffer() = default;
+
+    TrtCudaBuffer(const TrtCudaBuffer&) = delete;
+
+    TrtCudaBuffer& operator=(const TrtCudaBuffer&) = delete;
+
+    TrtCudaBuffer(TrtCudaBuffer&& rhs)
+    {
+        reset(rhs.mPtr);
+        rhs.mPtr = nullptr;
+    }
+
+    TrtCudaBuffer& operator=(TrtCudaBuffer&& rhs)
+    {
+        if (this != &rhs)
+        {
+            reset(rhs.mPtr);
+            rhs.mPtr = nullptr;
+        }
+        return *this;
+    }
+
+    ~TrtCudaBuffer()
+    {
+        reset();
+    }
+
+    TrtCudaBuffer(size_t size)
+    {
+        A()(&mPtr, size);
+    }
+
+    void allocate(size_t size)
+    {
+        reset();
+        A()(&mPtr, size);
+    }
+
+    void reset(void* ptr = nullptr)
+    {
+        if (mPtr)
+        {
+            D()(mPtr);
+        }
+        mPtr = ptr;
+    }
+
+    void* get() const
+    {
+        return mPtr;
+    }
+
+private:
+    void* mPtr{nullptr};
+};
+
+struct DeviceAllocator
+{
+    void operator()(void** ptr, size_t size)
+    {
+        cudaCheck(cudaMalloc(ptr, size));
+    }
+};
+
+struct DeviceDeallocator
+{
+    void operator()(void* ptr)
+    {
+        cudaCheck(cudaFree(ptr));
+    }
+};
+
+struct ManagedAllocator
+{
+    void operator()(void** ptr, size_t size)
+    {
+        cudaCheck(cudaMallocManaged(ptr, size));
+    }
+};
+
+struct HostAllocator
+{
+    void operator()(void** ptr, size_t size)
+    {
+        cudaCheck(cudaMallocHost(ptr, size));
+    }
+};
+
+struct HostDeallocator
+{
+    void operator()(void* ptr)
+    {
+        cudaCheck(cudaFreeHost(ptr));
+    }
+};
+
+using TrtDeviceBuffer = TrtCudaBuffer<DeviceAllocator, DeviceDeallocator>;
+using TrtManagedBuffer = TrtCudaBuffer<ManagedAllocator, DeviceDeallocator>;
+
+using TrtHostBuffer = TrtCudaBuffer<HostAllocator, HostDeallocator>;
+
+//!
+//! \class IMirroredBuffer
+//! \brief Coupled host and device buffers
+//!
+class IMirroredBuffer
+{
+public:
+    //!
+    //! Allocate memory for the mirrored buffer given the size
+    //! of the allocation.
+    //!
+    virtual void allocate(size_t size) = 0;
+
+    //!
+    //! Get the pointer to the device side buffer.
+    //!
+    //! \return pointer to device memory or nullptr if uninitialized.
+    //!
+    virtual void* getDeviceBuffer() const = 0;
+
+    //!
+    //! Get the pointer to the host side buffer.
+    //!
+    //! \return pointer to host memory or nullptr if uninitialized.
+    //!
+    virtual void* getHostBuffer() const = 0;
+
+    //!
+    //! Copy the memory from host to device.
+    //!
+    virtual void hostToDevice(TrtCudaStream& stream) = 0;
+
+    //!
+    //! Copy the memory from device to host.
+    //!
+    virtual void deviceToHost(TrtCudaStream& stream) = 0;
+
+    //!
+    //! Interface to get the size of the memory
+    //!
+    //! \return the size of memory allocated.
+    //!
+    virtual size_t getSize() const = 0;
+
+    //!
+    //! Virtual destructor declaration
+    //!
+    virtual ~IMirroredBuffer() = default;
+
+}; // class IMirroredBuffer
+
+//!
+//! Class to have a separate memory buffer for discrete device and host allocations.
+//!
+class DiscreteMirroredBuffer : public IMirroredBuffer
+{
+public:
+    void allocate(size_t size)
+    {
+        mSize = size;
+        mHostBuffer.allocate(size);
+        mDeviceBuffer.allocate(size);
+    }
+
+    void* getDeviceBuffer() const
+    {
+        return mDeviceBuffer.get();
+    }
+
+    void* getHostBuffer() const
+    {
+        return mHostBuffer.get();
+    }
+
+    void hostToDevice(TrtCudaStream& stream)
+    {
+        cudaCheck(cudaMemcpyAsync(mDeviceBuffer.get(), mHostBuffer.get(), mSize, cudaMemcpyHostToDevice, stream.get()));
+    }
+
+    void deviceToHost(TrtCudaStream& stream)
+    {
+        cudaCheck(cudaMemcpyAsync(mHostBuffer.get(), mDeviceBuffer.get(), mSize, cudaMemcpyDeviceToHost, stream.get()));
+    }
+
+    size_t getSize() const
+    {
+        return mSize;
+    }
+
+private:
+    size_t mSize{0};
+    TrtHostBuffer mHostBuffer;
+    TrtDeviceBuffer mDeviceBuffer;
+}; // class DiscreteMirroredBuffer
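+
+// Illustrative usage sketch (not part of the original header; `bytes`, `src`
+// and `stream` are hypothetical): a DiscreteMirroredBuffer keeps one host and
+// one device copy in sync.
+//   sample::DiscreteMirroredBuffer buf;
+//   buf.allocate(bytes);
+//   std::memcpy(buf.getHostBuffer(), src, bytes);
+//   buf.hostToDevice(stream); // async copy on a TrtCudaStream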
+
+//!
+//! Class to have a unified memory buffer for embedded devices.
+//!
+class UnifiedMirroredBuffer : public IMirroredBuffer
+{
+public:
+    void allocate(size_t size)
+    {
+        mSize = size;
+        mBuffer.allocate(size);
+    }
+
+    void* getDeviceBuffer() const
+    {
+        return mBuffer.get();
+    }
+
+    void* getHostBuffer() const
+    {
+        return mBuffer.get();
+    }
+
+    void hostToDevice(TrtCudaStream& /*stream*/)
+    {
+        // Does nothing since we are using unified memory.
+    }
+
+    void deviceToHost(TrtCudaStream& /*stream*/)
+    {
+        // Does nothing since we are using unified memory.
+    }
+
+    size_t getSize() const
+    {
+        return mSize;
+    }
+
+private:
+    size_t mSize{0};
+    TrtManagedBuffer mBuffer;
+}; // class UnifiedMirroredBuffer
+
+inline void setCudaDevice(int device, std::ostream& os)
+{
+    cudaCheck(cudaSetDevice(device));
+
+    cudaDeviceProp properties;
+    cudaCheck(cudaGetDeviceProperties(&properties, device));
+
+// clang-format off
+    os << "=== Device Information ===" << std::endl;
+    os << "Selected Device: " << properties.name << std::endl;
+    os << "Compute Capability: " << properties.major << "." << properties.minor << std::endl;
+    os << "SMs: " << properties.multiProcessorCount << std::endl;
+    os << "Compute Clock Rate: " << properties.clockRate / 1000000.0F << " GHz" << std::endl;
+    os << "Device Global Memory: " << (properties.totalGlobalMem >> 20) << " MiB" << std::endl;
+    os << "Shared Memory per SM: " << (properties.sharedMemPerMultiprocessor >> 10) << " KiB" << std::endl;
+    os << "Memory Bus Width: " << properties.memoryBusWidth << " bits"
+       << " (ECC " << (properties.ECCEnabled != 0 ? "enabled" : "disabled") << ")" << std::endl;
+    os << "Memory Clock Rate: " << properties.memoryClockRate / 1000000.0F << " GHz" << std::endl;
+    // clang-format on
+}
+
+} // namespace sample
+
+#endif // TRT_SAMPLE_DEVICE_H
diff --git a/src/Detector/tensorrt_yolo/common/sampleEngines.cpp b/src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.cpp
similarity index 100%
rename from src/Detector/tensorrt_yolo/common/sampleEngines.cpp
rename to src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.cpp
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.h
new file mode 100644
index 00000000..620b51a1
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_SAMPLE_ENGINES_H
+#define TRT_SAMPLE_ENGINES_H
+
+#include <iostream>
+#include <vector>
+
+#include "NvInfer.h"
+
+#if (NV_TENSORRT_MAJOR > 7)
+
+#include "NvInferConsistency.h"
+#include "NvInferSafeRuntime.h"
+
+#endif
+
+#include "NvOnnxParser.h"
+#include "sampleOptions.h"
+#include "sampleUtils.h"
+
+namespace sample
+{
+
+struct Parser
+{
+    TrtUniquePtr<nvonnxparser::IParser> onnxParser;
+
+    operator bool() const
+    {
+        return onnxParser.operator bool();
+    }
+};
+
+struct BuildEnvironment
+{
+    TrtUniquePtr<nvinfer1::INetworkDefinition> network;
+    //!
+
+struct BuildEnvironment
+{
+    TrtUniquePtr<nvinfer1::INetworkDefinition> network;
+    //! Parser that creates the network. Must be declared *after* network, so that when
+    //! ~BuildEnvironment() executes, the parser is destroyed before the network is destroyed.
+    Parser parser;
+    TrtUniquePtr<nvinfer1::ICudaEngine> engine;
+    std::unique_ptr<nvinfer1::safe::ICudaEngine> safeEngine;
+    std::vector<uint8_t> engineBlob;
+};
+
+//!
+//! \brief Generate a network definition for a given model
+//!
+//! \return Parser The parser used to initialize the network and that holds the weights for the network, or an invalid
+//! parser (the returned parser converts to false if tested)
+//!
+//! Constant input dimensions in the model must not be changed in the corresponding
+//! network definition, because its correctness may rely on the constants.
+//!
+//! \see Parser::operator bool()
+//!
+Parser modelToNetwork(const ModelOptions& model, nvinfer1::INetworkDefinition& network, std::ostream& err);
+
+//!
+//! \brief Set up network and config
+//!
+//! \return boolean Return true if network and config were successfully set
+//!
+bool setupNetworkAndConfig(const BuildOptions& build, const SystemOptions& sys, nvinfer1::IBuilder& builder,
+    nvinfer1::INetworkDefinition& network, nvinfer1::IBuilderConfig& config, std::ostream& err,
+    std::vector<std::vector<char>>& sparseWeights);
+
+//!
+//! \brief Log refittable layers and weights of a refittable engine
+//!
+void dumpRefittable(nvinfer1::ICudaEngine& engine);
+
+//!
+//! \brief Load a serialized engine
+//!
+//! \return Pointer to the engine loaded or nullptr if the operation failed
+//!
+nvinfer1::ICudaEngine* loadEngine(const std::string& engine, int DLACore, std::ostream& err);
+
+//!
+//! \brief Save an engine into a file
+//!
+//! \return boolean Return true if the engine was successfully saved
+//!
+bool saveEngine(const nvinfer1::ICudaEngine& engine, const std::string& fileName, std::ostream& err);
+
+//!
+//! \brief Create an engine from model or serialized file, and optionally save engine
+//!
+//! \return boolean Return true if the engine build environment was successfully created
+//!
+bool getEngineBuildEnv(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys,
+    BuildEnvironment& env, std::ostream& err);
+
+//!
+//! \brief Create an engine from model or serialized file, and optionally save engine
+//!
+//! \return Pointer to the engine created or nullptr if the creation failed
+//!
+inline TrtUniquePtr<nvinfer1::ICudaEngine> getEngine(
+    const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err)
+{
+    BuildEnvironment env;
+    TrtUniquePtr<nvinfer1::ICudaEngine> engine;
+    if (getEngineBuildEnv(model, build, sys, env, err))
+    {
+        engine.swap(env.engine);
+    }
+    return engine;
+}
+
+//!
+//! \brief Create a serialized network
+//!
+//! \return Pointer to a host memory for a serialized network
+//!
+nvinfer1::IHostMemory* networkToSerialized(const BuildOptions& build, const SystemOptions& sys, nvinfer1::IBuilder& builder,
+    nvinfer1::INetworkDefinition& network, std::ostream& err);
+
+//!
+//! \brief Transfer model to a serialized network
+//!
+//! \return Pointer to a host memory for a serialized network
+//!
+nvinfer1::IHostMemory* modelToSerialized(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err);
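A hedged sketch of how these declarations chain together (option values are placeholders; real callers populate the option structs via the parsers in sampleOptions.cpp, and `build.save`/`build.engine` are the fields set by --saveEngine below):

    // Build or load an engine from parsed options, then optionally persist it.
    sample::ModelOptions model;   // e.g. model.baseModel.model = "yolov5s.onnx"
    sample::BuildOptions build;
    sample::SystemOptions sys;
    auto engine = sample::getEngine(model, build, sys, std::cerr);
    if (engine && build.save)
    {
        sample::saveEngine(*engine, build.engine, std::cerr);  // build.engine holds the file name
    }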
+
+//!
+//! \brief Serialize network and save it into a file
+//!
+//! \return boolean Return true if the network was successfully serialized and saved
+//!
+bool serializeAndSave(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err);
+
+bool timeRefit(const nvinfer1::INetworkDefinition& network, nvinfer1::ICudaEngine& engine, bool multiThreading);
+
+//!
+//! \brief Set tensor scales from a calibration table
+//!
+void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, const std::vector<IOFormat>& inputFormats,
+    const std::vector<IOFormat>& outputFormats, const std::string& calibrationFile);
+
+//!
+//! \brief Check if safe runtime is loaded.
+//!
+bool hasSafeRuntime();
+
+//!
+//! \brief Create a safe runtime object if the dynamic library is loaded.
+//!
+nvinfer1::safe::IRuntime* createSafeInferRuntime(nvinfer1::ILogger& logger) noexcept;
+
+//!
+//! \brief Check if consistency checker is loaded.
+//!
+bool hasConsistencyChecker();
+
+//!
+//! \brief Create a consistency checker object if the dynamic library is loaded.
+//!
+nvinfer1::consistency::IConsistencyChecker* createConsistencyChecker(
+    nvinfer1::ILogger& logger, nvinfer1::IHostMemory const* engine) noexcept;
+
+//!
+//! \brief Run consistency check on serialized engine.
+//!
+bool checkSafeEngine(void const* serializedEngine, int32_t const engineSize);
+} // namespace sample
+
+#endif // TRT_SAMPLE_ENGINES_H
diff --git a/src/Detector/tensorrt_yolo/common/sampleInference.cpp b/src/Detector/tensorrt_yolo/common_deprecated/sampleInference.cpp
similarity index 100%
rename from src/Detector/tensorrt_yolo/common/sampleInference.cpp
rename to src/Detector/tensorrt_yolo/common_deprecated/sampleInference.cpp
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleInference.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleInference.h
new file mode 100644
index 00000000..1c21f592
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleInference.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_SAMPLE_INFERENCE_H
+#define TRT_SAMPLE_INFERENCE_H
+
+#include "sampleReporting.h"
+#include "sampleUtils.h"
+
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "NvInfer.h"
+
+#if (NV_TENSORRT_MAJOR > 7)
+
+#include "NvInferSafeRuntime.h"
+
+namespace sample
+{
+
+struct InferenceEnvironment
+{
+    TrtUniquePtr<nvinfer1::ICudaEngine> engine;
+    std::unique_ptr<Profiler> profiler;
+    std::vector<TrtUniquePtr<nvinfer1::IExecutionContext>> context;
+    std::vector<std::unique_ptr<Bindings>> bindings;
+    bool error{false};
+
+    std::vector<uint8_t> engineBlob;
+
+    bool safe{false};
+    std::unique_ptr<nvinfer1::safe::ICudaEngine> safeEngine;
+    std::vector<std::unique_ptr<nvinfer1::safe::IExecutionContext>> safeContext;
+
+    template <class ContextType>
+    inline ContextType* getContext(int32_t streamIdx);
+};
+
+template <>
+inline nvinfer1::IExecutionContext* InferenceEnvironment::getContext(int32_t streamIdx)
+{
+    return context[streamIdx].get();
+}
+
+template <>
+inline nvinfer1::safe::IExecutionContext* InferenceEnvironment::getContext(int32_t streamIdx)
+{
+    return safeContext[streamIdx].get();
+}
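The two specializations above let one call site serve both runtimes; a minimal sketch (stream index 0 is illustrative):

    // Pick the standard or safety-certified context with a template argument.
    sample::InferenceEnvironment iEnv;
    // ... setUpInference(iEnv, inference) fills context / safeContext ...
    if (iEnv.safe)
    {
        auto* ctx = iEnv.getContext<nvinfer1::safe::IExecutionContext>(0);
        (void) ctx; // enqueue on the safe context
    }
    else
    {
        auto* ctx = iEnv.getContext<nvinfer1::IExecutionContext>(0);
        (void) ctx; // enqueue on the standard context
    }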
+
+//!
+//! \brief Set up contexts and bindings for inference
+//!
+bool setUpInference(InferenceEnvironment& iEnv, const InferenceOptions& inference);
+
+//!
+//! \brief Deserialize the engine and time how long it takes.
+//!
+bool timeDeserialize(InferenceEnvironment& iEnv);
+
+//!
+//! \brief Run inference and collect timing, return false if any error hit during inference
+//!
+bool runInference(
+    const InferenceOptions& inference, InferenceEnvironment& iEnv, int32_t device, std::vector<InferenceTrace>& trace);
+
+//!
+//! \brief Get layer information of the engine.
+//!
+std::string getLayerInformation(const InferenceEnvironment& iEnv, nvinfer1::LayerInformationFormat format);
+
+} // namespace sample
+
+#endif
+
+#endif // TRT_SAMPLE_INFERENCE_H
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.cpp b/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.cpp
new file mode 100644
index 00000000..0afd163f
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.cpp
@@ -0,0 +1,1778 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include <cstring>
+#include <functional>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "NvInfer.h"
+
+#include "logger.h"
+#include "sampleOptions.h"
+
+namespace sample
+{
+
+namespace
+{
+
+std::vector<std::string> splitToStringVec(const std::string& option, char separator)
+{
+    std::vector<std::string> options;
+
+    for (size_t start = 0; start < option.length();)
+    {
+        size_t separatorIndex = option.find(separator, start);
+        if (separatorIndex == std::string::npos)
+        {
+            separatorIndex = option.length();
+        }
+        options.emplace_back(option.substr(start, separatorIndex - start));
+        start = separatorIndex + 1;
+    }
+
+    return options;
+}
+
+template <typename T>
+T stringToValue(const std::string& option)
+{
+    return T{option};
+}
+
+template <>
+int32_t stringToValue<int32_t>(const std::string& option)
+{
+    return std::stoi(option);
+}
+
+template <>
+float stringToValue<float>(const std::string& option)
+{
+    return std::stof(option);
+}
+
+template <>
+double stringToValue<double>(const std::string& option)
+{
+    return std::stod(option);
+}
+
+template <>
+bool stringToValue<bool>(const std::string& option)
+{
+    return true;
+}
+
+template <>
+std::vector<int32_t> stringToValue<std::vector<int32_t>>(const std::string& option)
+{
+    std::vector<int32_t> shape;
+    std::vector<std::string> dimsStrings = splitToStringVec(option, 'x');
+    for (const auto& d : dimsStrings)
+    {
+        shape.push_back(stringToValue<int32_t>(d));
+    }
+    return shape;
+}
+
+template <>
+nvinfer1::DataType stringToValue<nvinfer1::DataType>(const std::string& option)
+{
+    const std::unordered_map<std::string, nvinfer1::DataType> strToDT{{"fp32", nvinfer1::DataType::kFLOAT},
+        {"fp16", nvinfer1::DataType::kHALF}, {"int8", nvinfer1::DataType::kINT8},
+        {"int32", nvinfer1::DataType::kINT32}};
+    const auto& dt = strToDT.find(option);
+    if (dt == strToDT.end())
+    {
+        throw std::invalid_argument("Invalid DataType " + option);
+    }
+    return dt->second;
+}
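For reference, the specializations above resolve option strings like so (a sketch; values shown in the comments):

    auto n    = stringToValue<int32_t>("8");                        // 8
    auto dims = stringToValue<std::vector<int32_t>>("1x3x640x640"); // {1, 3, 640, 640}
    auto dt   = stringToValue<nvinfer1::DataType>("fp16");          // nvinfer1::DataType::kHALF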
+
+template <>
+nvinfer1::TensorFormats stringToValue<nvinfer1::TensorFormats>(const std::string& option)
+{
+    std::vector<std::string> optionStrings = splitToStringVec(option, '+');
+    const std::unordered_map<std::string, nvinfer1::TensorFormat> strToFmt{{"chw", nvinfer1::TensorFormat::kLINEAR},
+        {"chw2", nvinfer1::TensorFormat::kCHW2}, {"chw4", nvinfer1::TensorFormat::kCHW4},
+        {"hwc8", nvinfer1::TensorFormat::kHWC8}, {"chw16", nvinfer1::TensorFormat::kCHW16},
+        {"chw32", nvinfer1::TensorFormat::kCHW32}, {"dhwc8", nvinfer1::TensorFormat::kDHWC8},
+        {"hwc", nvinfer1::TensorFormat::kHWC}, {"dla_linear", nvinfer1::TensorFormat::kDLA_LINEAR},
+        {"dla_hwc4", nvinfer1::TensorFormat::kDLA_HWC4}};
+    nvinfer1::TensorFormats formats{};
+    for (auto f : optionStrings)
+    {
+        const auto& tf = strToFmt.find(f);
+        if (tf == strToFmt.end())
+        {
+            throw std::invalid_argument(std::string("Invalid TensorFormat ") + f);
+        }
+        formats |= 1U << static_cast<int32_t>(tf->second);
+    }
+
+    return formats;
+}
+
+template <>
+IOFormat stringToValue<IOFormat>(const std::string& option)
+{
+    IOFormat ioFormat{};
+    const size_t colon = option.find(':');
+
+    if (colon == std::string::npos)
+    {
+        throw std::invalid_argument(std::string("Invalid IOFormat ") + option);
+    }
+
+    ioFormat.first = stringToValue<nvinfer1::DataType>(option.substr(0, colon));
+    ioFormat.second = stringToValue<nvinfer1::TensorFormats>(option.substr(colon + 1));
+
+    return ioFormat;
+}
+
+template <typename T>
+std::pair<std::string, T> splitNameAndValue(const std::string& s)
+{
+    std::string tensorName;
+    std::string valueString;
+    // Split on the last :
+    std::vector<std::string> nameRange{splitToStringVec(s, ':')};
+    // Everything before the last : is the name
+    tensorName = nameRange[0];
+    for (size_t i = 1; i < nameRange.size() - 1; i++)
+    {
+        tensorName += ":" + nameRange[i];
+    }
+    // Value is the string element after the last :
+    valueString = nameRange[nameRange.size() - 1];
+    return std::pair<std::string, T>(tensorName, stringToValue<T>(valueString));
+}
+
+template <typename T>
+void splitInsertKeyValue(const std::vector<std::string>& kvList, T& map)
+{
+    for (const auto& kv : kvList)
+    {
+        map.insert(splitNameAndValue<typename T::mapped_type>(kv));
+    }
+}
+
+const char* boolToEnabled(bool enable)
+{
+    return enable ? "Enabled" : "Disabled";
+}
+
+//! Check if input option exists in input arguments.
+//! If it does: return its value, erase the argument and return true.
+//! If it does not: return false.
+template <typename T>
+bool getAndDelOption(Arguments& arguments, const std::string& option, T& value)
+{
+    const auto match = arguments.find(option);
+    if (match != arguments.end())
+    {
+        value = stringToValue<T>(match->second);
+        arguments.erase(match);
+        return true;
+    }
+
+    return false;
+}
+
+//! Check if input option exists in input arguments.
+//! If it does: return false in value, erase the argument and return true.
+//! If it does not: return false.
+bool getAndDelNegOption(Arguments& arguments, const std::string& option, bool& value)
+{
+    bool dummy;
+    if (getAndDelOption(arguments, option, dummy))
+    {
+        value = false;
+        return true;
+    }
+    return false;
+}
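A small sketch of the option-consumption pattern (it assumes `Arguments` is the string multimap declared in sampleOptions.h; these helpers live in the anonymous namespace, so the calls are illustrative):

    Arguments args; // normally produced by argsToArgumentsMap(argc, argv)
    args.emplace("--fp16", "");
    args.emplace("--workspace", "2048");
    bool fp16{false};
    double workspace{-1.0};
    getAndDelOption(args, "--fp16", fp16);           // fp16 == true, entry erased
    getAndDelOption(args, "--workspace", workspace); // workspace == 2048.0, entry erased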
+
+//! Check if input option exists in input arguments.
+//! If it does: add all the matched arg values to values vector, erase the argument and return true.
+//! If it does not: return false.
+template <typename T>
+bool getAndDelRepeatedOption(Arguments& arguments, const std::string& option, std::vector<T>& values)
+{
+    const auto match = arguments.equal_range(option);
+    if (match.first == match.second)
+    {
+        return false;
+    }
+
+    auto addToValues
+        = [&values](Arguments::value_type& argValue) { values.emplace_back(stringToValue<T>(argValue.second)); };
+    std::for_each(match.first, match.second, addToValues);
+    arguments.erase(match.first, match.second);
+
+    return true;
+}
+
+void insertShapesBuild(std::unordered_map<std::string, ShapeRange>& shapes, nvinfer1::OptProfileSelector selector, const std::string& name, const std::vector<int32_t>& dims)
+{
+    shapes[name][static_cast<size_t>(selector)] = dims;
+}
+
+void insertShapesInference(std::unordered_map<std::string, std::vector<int32_t>>& shapes, const std::string& name, const std::vector<int32_t>& dims)
+{
+    shapes[name] = dims;
+}
+
+std::string removeSingleQuotationMarks(std::string& str)
+{
+    std::vector<std::string> strList{splitToStringVec(str, '\'')};
+    // Remove all the escaped single quotation marks
+    std::string retVal = "";
+    // Do not really care about unterminated sequences
+    for (size_t i = 0; i < strList.size(); i++)
+    {
+        retVal += strList[i];
+    }
+    return retVal;
+}
+
+void getLayerPrecisions(Arguments& arguments, char const* argument, LayerPrecisions& layerPrecisions)
+{
+    std::string list;
+    if (!getAndDelOption(arguments, argument, list))
+    {
+        return;
+    }
+
+    // The layerPrecisions flag contains comma-separated layerName:precision pairs.
+    std::vector<std::string> precisionList{splitToStringVec(list, ',')};
+    for (auto const& s : precisionList)
+    {
+        auto namePrecisionPair = splitNameAndValue<nvinfer1::DataType>(s);
+        auto const layerName = removeSingleQuotationMarks(namePrecisionPair.first);
+        layerPrecisions[layerName] = namePrecisionPair.second;
+    }
+}
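A worked example of the spec format getLayerPrecisions() accepts (assuming LayerPrecisions maps layer names to nvinfer1::DataType, per sampleOptions.h):

    Arguments args;
    args.emplace("--layerPrecisions", "conv1:fp16,'*':fp32");
    LayerPrecisions lp;
    getLayerPrecisions(args, "--layerPrecisions", lp);
    // lp["conv1"] == nvinfer1::DataType::kHALF
    // lp["*"]     == nvinfer1::DataType::kFLOAT  (quotes stripped; "*" is the default for unlisted layers)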
+
+void getLayerOutputTypes(Arguments& arguments, char const* argument, LayerOutputTypes& layerOutputTypes)
+{
+    std::string list;
+    if (!getAndDelOption(arguments, argument, list))
+    {
+        return;
+    }
+
+    // The layerOutputTypes flag contains comma-separated layerName:types pairs.
+    std::vector<std::string> precisionList{splitToStringVec(list, ',')};
+    for (auto const& s : precisionList)
+    {
+        auto namePrecisionPair = splitNameAndValue<std::string>(s);
+        auto const layerName = removeSingleQuotationMarks(namePrecisionPair.first);
+        auto const typeStrings = splitToStringVec(namePrecisionPair.second, '+');
+        std::vector<nvinfer1::DataType> typeVec(typeStrings.size(), nvinfer1::DataType::kFLOAT);
+        std::transform(typeStrings.begin(), typeStrings.end(), typeVec.begin(), stringToValue<nvinfer1::DataType>);
+        layerOutputTypes[layerName] = typeVec;
+    }
+}
+
+bool getShapesBuild(Arguments& arguments, std::unordered_map<std::string, ShapeRange>& shapes, char const* argument,
+    nvinfer1::OptProfileSelector selector)
+{
+    std::string list;
+    bool retVal = getAndDelOption(arguments, argument, list);
+    std::vector<std::string> shapeList{splitToStringVec(list, ',')};
+    for (const auto& s : shapeList)
+    {
+        auto nameDimsPair = splitNameAndValue<std::vector<int32_t>>(s);
+        auto tensorName = removeSingleQuotationMarks(nameDimsPair.first);
+        auto dims = nameDimsPair.second;
+        insertShapesBuild(shapes, selector, tensorName, dims);
+    }
+    return retVal;
+}
+
+bool getShapesInference(Arguments& arguments, std::unordered_map<std::string, std::vector<int32_t>>& shapes, const char* argument)
+{
+    std::string list;
+    bool retVal = getAndDelOption(arguments, argument, list);
+    std::vector<std::string> shapeList{splitToStringVec(list, ',')};
+    for (const auto& s : shapeList)
+    {
+        auto nameDimsPair = splitNameAndValue<std::vector<int32_t>>(s);
+        auto tensorName = removeSingleQuotationMarks(nameDimsPair.first);
+        auto dims = nameDimsPair.second;
+        insertShapesInference(shapes, tensorName, dims);
+    }
+    return retVal;
+}
+
+void processShapes(std::unordered_map<std::string, ShapeRange>& shapes, bool minShapes, bool optShapes, bool maxShapes, bool calib)
+{
+    // Only accept optShapes only or all three of minShapes, optShapes, maxShapes
+    if ( ((minShapes || maxShapes) && !optShapes)  // minShapes only, maxShapes only, both minShapes and maxShapes
+        || (minShapes && !maxShapes && optShapes)  // both minShapes and optShapes
+        || (!minShapes && maxShapes && optShapes)) // both maxShapes and optShapes
+    {
+        if (calib)
+        {
+            throw std::invalid_argument("Must specify only --optShapesCalib or all of --minShapesCalib, --optShapesCalib, --maxShapesCalib");
+        }
+        else
+        {
+            throw std::invalid_argument("Must specify only --optShapes or all of --minShapes, --optShapes, --maxShapes");
+        }
+    }
+
+    // If optShapes only, expand optShapes to minShapes and maxShapes
+    if (optShapes && !minShapes && !maxShapes)
+    {
+        std::unordered_map<std::string, ShapeRange> newShapes;
+        for (auto& s : shapes)
+        {
+            insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kMIN, s.first, s.second[static_cast<size_t>(nvinfer1::OptProfileSelector::kOPT)]);
+            insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kOPT, s.first, s.second[static_cast<size_t>(nvinfer1::OptProfileSelector::kOPT)]);
+            insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kMAX, s.first, s.second[static_cast<size_t>(nvinfer1::OptProfileSelector::kOPT)]);
+        }
+        shapes = newShapes;
+    }
+}
+
+template <typename T>
+void printShapes(std::ostream& os, const char* phase, const T& shapes)
+{
+    if (shapes.empty())
+    {
+        os << "Input " << phase << " shapes: model" << std::endl;
+    }
+    else
+    {
+        for (const auto& s : shapes)
+        {
+            os << "Input " << phase << " shape: " << s.first << "=" << s.second << std::endl;
+        }
+    }
+}
+
+std::ostream& printBatch(std::ostream& os, int32_t maxBatch)
+{
+    if (maxBatch != maxBatchNotProvided)
+    {
+        os << maxBatch;
+    }
+    else
+    {
+        os << "explicit batch";
+    }
+    return os;
+}
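The broadcast rule in processShapes() in one sketch (tensor name and dims are illustrative):

    std::unordered_map<std::string, ShapeRange> shapes;
    insertShapesBuild(shapes, nvinfer1::OptProfileSelector::kOPT, "input", {1, 3, 640, 640});
    processShapes(shapes, false /*min*/, true /*opt*/, false /*max*/, false /*calib*/);
    // shapes["input"] now holds identical kMIN/kOPT/kMAX dims; supplying min or max
    // without the full min/opt/max triple throws std::invalid_argument, as coded above.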
+
+std::ostream& printTacticSources(
+    std::ostream& os, nvinfer1::TacticSources enabledSources, nvinfer1::TacticSources disabledSources)
+{
+    if (!enabledSources && !disabledSources)
+    {
+        os << "Using default tactic sources";
+    }
+    else
+    {
+        auto const addSource = [&](uint32_t source, std::string const& name) {
+            if (enabledSources & source)
+            {
+                os << name << " [ON], ";
+            }
+            else if (disabledSources & source)
+            {
+                os << name << " [OFF], ";
+            }
+        };
+
+        addSource(1U << static_cast<uint32_t>(nvinfer1::TacticSource::kCUBLAS), "cublas");
+        addSource(1U << static_cast<uint32_t>(nvinfer1::TacticSource::kCUBLAS_LT), "cublasLt");
+#if (NV_TENSORRT_MAJOR > 7)
+        addSource(1U << static_cast<uint32_t>(nvinfer1::TacticSource::kCUDNN), "cudnn");
+#endif
+    }
+    return os;
+}
+
+std::ostream& printPrecision(std::ostream& os, BuildOptions const& options)
+{
+    os << "FP32";
+    if (options.fp16)
+    {
+        os << "+FP16";
+    }
+    if (options.int8)
+    {
+        os << "+INT8";
+    }
+    if (options.precisionConstraints == PrecisionConstraints::kOBEY)
+    {
+        os << " (obey precision constraints)";
+    }
+    if (options.precisionConstraints == PrecisionConstraints::kPREFER)
+    {
+        os << " (prefer precision constraints)";
+    }
+    return os;
+}
+
+std::ostream& printTimingCache(std::ostream& os, BuildOptions const& options)
+{
+    switch (options.timingCacheMode)
+    {
+    case TimingCacheMode::kGLOBAL: os << "global"; break;
+    case TimingCacheMode::kLOCAL: os << "local"; break;
+    case TimingCacheMode::kDISABLE: os << "disable"; break;
+    }
+    return os;
+}
+
+std::ostream& printSparsity(std::ostream& os, BuildOptions const& options)
+{
+    switch (options.sparsity)
+    {
+    case SparsityFlag::kDISABLE: os << "Disabled"; break;
+    case SparsityFlag::kENABLE: os << "Enabled"; break;
+    case SparsityFlag::kFORCE: os << "Forced"; break;
+    }
+
+    return os;
+}
+
+std::ostream& printMemoryPools(std::ostream& os, BuildOptions const& options)
+{
+    auto const printValueOrDefault = [&os](double const val) {
+        if (val >= 0)
+        {
+            os << val << " MiB";
+        }
+        else
+        {
+            os << "default";
+        }
+    };
+    os << "workspace: "; printValueOrDefault(options.workspace); os << ", ";
+    os << "dlaSRAM: "; printValueOrDefault(options.dlaSRAM); os << ", ";
+    os << "dlaLocalDRAM: "; printValueOrDefault(options.dlaLocalDRAM); os << ", ";
+    os << "dlaGlobalDRAM: "; printValueOrDefault(options.dlaGlobalDRAM);
+    return os;
+}
+
+} // namespace
+
+Arguments argsToArgumentsMap(int32_t argc, char* argv[])
+{
+    Arguments arguments;
+    for (int32_t i = 1; i < argc; ++i)
+    {
+        auto valuePtr = strchr(argv[i], '=');
+        if (valuePtr)
+        {
+            std::string value{valuePtr + 1};
+            arguments.emplace(std::string(argv[i], valuePtr - argv[i]), value);
+        }
+        else
+        {
+            arguments.emplace(argv[i], "");
+        }
+    }
+    return arguments;
+}
+
+void BaseModelOptions::parse(Arguments& arguments)
+{
+    if (getAndDelOption(arguments, "--onnx", model))
+    {
+        format = ModelFormat::kONNX;
+    }
+    else if (getAndDelOption(arguments, "--uff", model))
+    {
+        format = ModelFormat::kUFF;
+    }
+    else if (getAndDelOption(arguments, "--model", model))
+    {
+        format = ModelFormat::kCAFFE;
+    }
+}
+
+void UffInput::parse(Arguments& arguments)
+{
+    getAndDelOption(arguments, "--uffNHWC", NHWC);
+    std::vector<std::string> args;
+    if (getAndDelRepeatedOption(arguments, "--uffInput", args))
+    {
+        for (const auto& i : args)
+        {
+            std::vector<std::string> values{splitToStringVec(i, ',')};
+            if (values.size() == 4)
+            {
+                nvinfer1::Dims3 dims{std::stoi(values[1]), std::stoi(values[2]), std::stoi(values[3])};
+                inputs.emplace_back(values[0], dims);
+            }
+            else
+            {
+                throw std::invalid_argument(std::string("Invalid uffInput ") + i);
+            }
+        }
+    }
+}
+
+void ModelOptions::parse(Arguments& arguments)
+{
+    baseModel.parse(arguments);
+
+    switch (baseModel.format)
+    {
+    case ModelFormat::kCAFFE:
+    {
+        getAndDelOption(arguments, "--deploy", prototxt);
+        break;
+    }
+    case ModelFormat::kUFF:
+    {
+        uffInputs.parse(arguments);
+        if (uffInputs.inputs.empty())
+        {
+            throw std::invalid_argument("Uff models require at least one input");
+        }
+        break;
+    }
+    case ModelFormat::kONNX:
+        break;
+    case ModelFormat::kANY:
+    {
+        if (getAndDelOption(arguments, "--deploy", prototxt))
+        {
+            baseModel.format = ModelFormat::kCAFFE;
+        }
+        break;
+    }
+    }
+
+    // The --output flag should only be used with Caffe and UFF. It has no effect on ONNX.
+    std::vector<std::string> outArgs;
+    if (getAndDelRepeatedOption(arguments, "--output", outArgs))
+    {
+        for (const auto& o : outArgs)
+        {
+            for (auto& v : splitToStringVec(o, ','))
+            {
+                outputs.emplace_back(std::move(v));
+            }
+        }
+    }
+    if (baseModel.format == ModelFormat::kCAFFE || baseModel.format == ModelFormat::kUFF)
+    {
+        if (outputs.empty())
+        {
+            throw std::invalid_argument("Caffe and Uff models require at least one output");
+        }
+    }
+    else if (baseModel.format == ModelFormat::kONNX)
+    {
+        if (!outputs.empty())
+        {
+            throw std::invalid_argument("The --output flag should not be used with ONNX models.");
+        }
+    }
+}
+
+void BuildOptions::parse(Arguments& arguments)
+{
+    auto getFormats = [&arguments](std::vector<IOFormat>& formatsVector, const char* argument) {
+        std::string list;
+        getAndDelOption(arguments, argument, list);
+        std::vector<std::string> formats{splitToStringVec(list, ',')};
+        for (const auto& f : formats)
+        {
+            formatsVector.push_back(stringToValue<IOFormat>(f));
+        }
+    };
+
+    getFormats(inputFormats, "--inputIOFormats");
+    getFormats(outputFormats, "--outputIOFormats");
+
+    bool addedExplicitBatchFlag{false};
+    getAndDelOption(arguments, "--explicitBatch", addedExplicitBatchFlag);
+    if (addedExplicitBatchFlag)
+    {
+        sample::gLogWarning << "--explicitBatch flag has been deprecated and has no effect!" << std::endl;
+        sample::gLogWarning << "Explicit batch dim is automatically enabled if input model is ONNX or if dynamic "
+                            << "shapes are provided when the engine is built." << std::endl;
+    }
+
+    bool minShapes = getShapesBuild(arguments, shapes, "--minShapes", nvinfer1::OptProfileSelector::kMIN);
+    bool optShapes = getShapesBuild(arguments, shapes, "--optShapes", nvinfer1::OptProfileSelector::kOPT);
+    bool maxShapes = getShapesBuild(arguments, shapes, "--maxShapes", nvinfer1::OptProfileSelector::kMAX);
+    processShapes(shapes, minShapes, optShapes, maxShapes, false);
+    bool minShapesCalib
+        = getShapesBuild(arguments, shapesCalib, "--minShapesCalib", nvinfer1::OptProfileSelector::kMIN);
+    bool optShapesCalib
+        = getShapesBuild(arguments, shapesCalib, "--optShapesCalib", nvinfer1::OptProfileSelector::kOPT);
+    bool maxShapesCalib
+        = getShapesBuild(arguments, shapesCalib, "--maxShapesCalib", nvinfer1::OptProfileSelector::kMAX);
+    processShapes(shapesCalib, minShapesCalib, optShapesCalib, maxShapesCalib, true);
+
+    bool addedExplicitPrecisionFlag{false};
+    getAndDelOption(arguments, "--explicitPrecision", addedExplicitPrecisionFlag);
+    if (addedExplicitPrecisionFlag)
+    {
+        sample::gLogWarning << "--explicitPrecision flag has been deprecated and has no effect!" << std::endl;
+    }
+
+    if (getAndDelOption(arguments, "--workspace", workspace))
+    {
+        sample::gLogWarning << "--workspace flag has been deprecated by --memPoolSize flag."
+                            << std::endl;
+    }
+
+    std::string memPoolSizes;
+    getAndDelOption(arguments, "--memPoolSize", memPoolSizes);
+    std::vector<std::string> memPoolSpecs{splitToStringVec(memPoolSizes, ',')};
+    for (auto const& memPoolSpec : memPoolSpecs)
+    {
+        std::string memPoolName;
+        double memPoolSize;
+        std::tie(memPoolName, memPoolSize) = splitNameAndValue<double>(memPoolSpec);
+        if (memPoolSize < 0)
+        {
+            throw std::invalid_argument(std::string("Negative memory pool size: ") + std::to_string(memPoolSize));
+        }
+        if (memPoolName == "workspace")
+        {
+            workspace = memPoolSize;
+        }
+        else if (memPoolName == "dlaSRAM")
+        {
+            dlaSRAM = memPoolSize;
+        }
+        else if (memPoolName == "dlaLocalDRAM")
+        {
+            dlaLocalDRAM = memPoolSize;
+        }
+        else if (memPoolName == "dlaGlobalDRAM")
+        {
+            dlaGlobalDRAM = memPoolSize;
+        }
+        else if (!memPoolName.empty())
+        {
+            throw std::invalid_argument(std::string("Unknown memory pool: ") + memPoolName);
+        }
+    }
+
+    getAndDelOption(arguments, "--maxBatch", maxBatch);
+    getAndDelOption(arguments, "--minTiming", minTiming);
+    getAndDelOption(arguments, "--avgTiming", avgTiming);
+
+    bool best{false};
+    getAndDelOption(arguments, "--best", best);
+    if (best)
+    {
+        int8 = true;
+        fp16 = true;
+    }
+
+    getAndDelOption(arguments, "--refit", refittable);
+    getAndDelNegOption(arguments, "--noTF32", tf32);
+    getAndDelOption(arguments, "--fp16", fp16);
+    getAndDelOption(arguments, "--int8", int8);
+    getAndDelOption(arguments, "--safe", safe);
+    getAndDelOption(arguments, "--consistency", consistency);
+    getAndDelOption(arguments, "--restricted", restricted);
+
+    getAndDelOption(arguments, "--directIO", directIO);
+
+    std::string precisionConstraintsString;
+    getAndDelOption(arguments, "--precisionConstraints", precisionConstraintsString);
+    if (!precisionConstraintsString.empty())
+    {
+        const std::unordered_map<std::string, PrecisionConstraints> precisionConstraintsMap
+            = {{"obey", PrecisionConstraints::kOBEY}, {"prefer", PrecisionConstraints::kPREFER},
+                {"none", PrecisionConstraints::kNONE}};
+        auto it = precisionConstraintsMap.find(precisionConstraintsString);
+        if (it == precisionConstraintsMap.end())
+        {
+            throw std::invalid_argument(std::string("Unknown precision constraints: ") + precisionConstraintsString);
+        }
+        precisionConstraints = it->second;
+    }
+    else
+    {
+        precisionConstraints = PrecisionConstraints::kNONE;
+    }
+
+    getLayerPrecisions(arguments, "--layerPrecisions", layerPrecisions);
+    getLayerOutputTypes(arguments, "--layerOutputTypes", layerOutputTypes);
+
+    if (layerPrecisions.empty() && layerOutputTypes.empty() && precisionConstraints != PrecisionConstraints::kNONE)
+    {
+        sample::gLogWarning << "When --precisionConstraints flag is set to \"obey\" or \"prefer\", please add "
+                            << "--layerPrecisions/--layerOutputTypes flags to set layer-wise precisions and output "
+                            << "types." << std::endl;
+    }
+    else if ((!layerPrecisions.empty() || !layerOutputTypes.empty())
+        && precisionConstraints == PrecisionConstraints::kNONE)
+    {
+        sample::gLogWarning << "--layerPrecisions/--layerOutputTypes flags have no effect when --precisionConstraints "
+                            << "flag is set to \"none\"."
+                            << std::endl;
+    }
+
+    std::string sparsityString;
+    getAndDelOption(arguments, "--sparsity", sparsityString);
+    if (sparsityString == "disable")
+    {
+        sparsity = SparsityFlag::kDISABLE;
+    }
+    else if (sparsityString == "enable")
+    {
+        sparsity = SparsityFlag::kENABLE;
+    }
+    else if (sparsityString == "force")
+    {
+        sparsity = SparsityFlag::kFORCE;
+    }
+    else if (!sparsityString.empty())
+    {
+        throw std::invalid_argument(std::string("Unknown sparsity mode: ") + sparsityString);
+    }
+
+    bool calibCheck = getAndDelOption(arguments, "--calib", calibration);
+    if (int8 && calibCheck && !shapes.empty() && shapesCalib.empty())
+    {
+        shapesCalib = shapes;
+    }
+
+    std::string profilingVerbosityString;
+    if (getAndDelOption(arguments, "--nvtxMode", profilingVerbosityString))
+    {
+        sample::gLogWarning << "--nvtxMode flag has been deprecated by --profilingVerbosity flag." << std::endl;
+    }
+
+    getAndDelOption(arguments, "--profilingVerbosity", profilingVerbosityString);
+    if (profilingVerbosityString == "layer_names_only")
+    {
+#if (NV_TENSORRT_MAJOR > 7)
+        profilingVerbosity = nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY;
+#else
+        profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT;
+#endif
+    }
+    else if (profilingVerbosityString == "none")
+    {
+        profilingVerbosity = nvinfer1::ProfilingVerbosity::kNONE;
+    }
+#if (NV_TENSORRT_MAJOR > 7)
+    else if (profilingVerbosityString == "detailed")
+    {
+        profilingVerbosity = nvinfer1::ProfilingVerbosity::kDETAILED;
+    }
+#endif
+    else if (profilingVerbosityString == "default")
+    {
+#if (NV_TENSORRT_MAJOR > 7)
+        sample::gLogWarning << "--profilingVerbosity=default has been deprecated by "
+                               "--profilingVerbosity=layer_names_only."
+                            << std::endl;
+        profilingVerbosity = nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY;
+#else
+        profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT;
+#endif
+    }
+    else if (profilingVerbosityString == "verbose")
+    {
+#if (NV_TENSORRT_MAJOR > 7)
+        sample::gLogWarning << "--profilingVerbosity=verbose has been deprecated by --profilingVerbosity=detailed."
+                            << std::endl;
+        profilingVerbosity = nvinfer1::ProfilingVerbosity::kDETAILED;
+#else
+        profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT;
+#endif
+    }
+    else if (!profilingVerbosityString.empty())
+    {
+        throw std::invalid_argument(std::string("Unknown profilingVerbosity: ") + profilingVerbosityString);
+    }
+
+    if (getAndDelOption(arguments, "--loadEngine", engine))
+    {
+        load = true;
+    }
+    if (getAndDelOption(arguments, "--saveEngine", engine))
+    {
+        save = true;
+    }
+    if (load && save)
+    {
+        throw std::invalid_argument("Incompatible load and save engine options selected");
+    }
+
+    std::string tacticSourceArgs;
+    if (getAndDelOption(arguments, "--tacticSources", tacticSourceArgs))
+    {
+        std::vector<std::string> tacticList = splitToStringVec(tacticSourceArgs, ',');
+        for (auto& t : tacticList)
+        {
+            bool enable{false};
+            if (t.front() == '+')
+            {
+                enable = true;
+            }
+            else if (t.front() != '-')
+            {
+                throw std::invalid_argument(
+                    "Tactic source must be prefixed with + or -, indicating whether it should be enabled or disabled "
+                    "respectively.");
+            }
+            t.erase(0, 1);
+
+            const auto toUpper = [](std::string& sourceName) {
+                std::transform(
+                    sourceName.begin(), sourceName.end(), sourceName.begin(), [](char c) { return std::toupper(c); });
+                return sourceName;
+            };
+
+            nvinfer1::TacticSource source{};
+            t = toUpper(t);
+            if (t == "CUBLAS")
+            {
+                source = nvinfer1::TacticSource::kCUBLAS;
+            }
+            else if (t == "CUBLASLT" || t == "CUBLAS_LT")
+            {
+                source = nvinfer1::TacticSource::kCUBLAS_LT;
+            }
+#if (NV_TENSORRT_MAJOR > 7)
+            else if (t == "CUDNN")
+            {
+                source = nvinfer1::TacticSource::kCUDNN;
+            }
+#endif
+            else
+            {
+                throw std::invalid_argument(std::string("Unknown tactic source: ") + t);
+            }
+
+            uint32_t sourceBit = 1U << static_cast<uint32_t>(source);
+
+            if (enable)
+            {
+                enabledTactics |= sourceBit;
+            }
+            else
+            {
+                disabledTactics |= sourceBit;
+            }
+
+            if (enabledTactics & disabledTactics)
+            {
+                throw std::invalid_argument(std::string("Cannot enable and disable ") + t);
+            }
+        }
+    }
+
+    bool noBuilderCache{false};
+    getAndDelOption(arguments, "--noBuilderCache", noBuilderCache);
+    getAndDelOption(arguments, "--timingCacheFile", timingCacheFile);
+    if (noBuilderCache)
+    {
+        timingCacheMode = TimingCacheMode::kDISABLE;
+    }
+    else if (!timingCacheFile.empty())
+    {
+        timingCacheMode = TimingCacheMode::kGLOBAL;
+    }
+    else
+    {
+        timingCacheMode = TimingCacheMode::kLOCAL;
+    }
+}
+
+void SystemOptions::parse(Arguments& arguments)
+{
+    getAndDelOption(arguments, "--device", device);
+    getAndDelOption(arguments, "--useDLACore", DLACore);
+    getAndDelOption(arguments, "--allowGPUFallback", fallback);
+    std::string pluginName;
+    while (getAndDelOption(arguments, "--plugins", pluginName))
+    {
+        plugins.emplace_back(pluginName);
+    }
+}
+
+void InferenceOptions::parse(Arguments& arguments)
+{
+    getAndDelOption(arguments, "--streams", streams);
+    getAndDelOption(arguments, "--iterations", iterations);
+    getAndDelOption(arguments, "--duration", duration);
+    getAndDelOption(arguments, "--warmUp", warmup);
+    getAndDelOption(arguments, "--sleepTime", sleep);
+    getAndDelOption(arguments, "--idleTime", idle);
+    bool exposeDMA{false};
+    if (getAndDelOption(arguments, "--exposeDMA", exposeDMA))
+    {
+        overlap = !exposeDMA;
+    }
+    getAndDelOption(arguments, "--noDataTransfers", skipTransfers);
+    getAndDelOption(arguments, "--useManagedMemory", useManaged);
+    getAndDelOption(arguments, "--useSpinWait", spin);
+    getAndDelOption(arguments, "--threads", threads);
+    getAndDelOption(arguments, "--useCudaGraph", graph);
+    getAndDelOption(arguments, "--separateProfileRun", rerun);
+    getAndDelOption(arguments, "--buildOnly", skip);
+    getAndDelOption(arguments, "--timeDeserialize", timeDeserialize);
+    getAndDelOption(arguments, "--timeRefit", timeRefit);
+
+    std::string list;
+    getAndDelOption(arguments, "--loadInputs", list);
+    std::vector<std::string> inputsList{splitToStringVec(list, ',')};
+    splitInsertKeyValue(inputsList, inputs);
+
+    getShapesInference(arguments, shapes, "--shapes");
+    getAndDelOption(arguments, "--batch", batch);
+}
+
+void ReportingOptions::parse(Arguments& arguments)
+{
+    getAndDelOption(arguments, "--percentile", percentile);
+    getAndDelOption(arguments, "--avgRuns", avgs);
+    getAndDelOption(arguments, "--verbose", verbose);
+    getAndDelOption(arguments, "--dumpRefit", refit);
+    getAndDelOption(arguments, "--dumpOutput", output);
+    getAndDelOption(arguments, "--dumpProfile", profile);
+    getAndDelOption(arguments, "--dumpLayerInfo", layerInfo);
+    getAndDelOption(arguments, "--exportTimes", exportTimes);
+    getAndDelOption(arguments, "--exportOutput", exportOutput);
+    getAndDelOption(arguments, "--exportProfile", exportProfile);
+    getAndDelOption(arguments, "--exportLayerInfo", exportLayerInfo);
+    if (percentile < 0 || percentile > 100)
+    {
+        throw std::invalid_argument(std::string("Percentile ") + std::to_string(percentile) + " is not in [0,100]");
+    }
+}
+
+bool parseHelp(Arguments& arguments)
+{
+    bool helpLong{false};
+    bool helpShort{false};
+    getAndDelOption(arguments, "--help", helpLong);
+    getAndDelOption(arguments, "-h", helpShort);
+    return helpLong || helpShort;
+}
+
+void AllOptions::parse(Arguments& arguments)
+{
+    model.parse(arguments);
+    build.parse(arguments);
+    system.parse(arguments);
+    inference.parse(arguments);
+
+    // Use explicitBatch when input model is ONNX or when dynamic shapes are used.
+    const bool isOnnx{model.baseModel.format == ModelFormat::kONNX};
+    const bool hasDynamicShapes{!build.shapes.empty() || !inference.shapes.empty()};
+    const bool detectedExplicitBatch = isOnnx || hasDynamicShapes;
+
+    // Throw an error if user tries to use --batch or --maxBatch when the engine has explicit batch dim.
+    const bool maxBatchWasSet{build.maxBatch != maxBatchNotProvided};
+    const bool batchWasSet{inference.batch != batchNotProvided};
+    if (detectedExplicitBatch && (maxBatchWasSet || batchWasSet))
+    {
+        throw std::invalid_argument(
+            "The --batch and --maxBatch flags should not be used when the input model is ONNX or when dynamic shapes "
+            "are provided. Please use --optShapes and --shapes to set input shapes instead.");
+    }
+
+    // If batch and/or maxBatch is not set and the engine has implicit batch dim, set them to default values.
+    if (!detectedExplicitBatch)
+    {
+        // If batch is not set, set it to default value.
+        if (!batchWasSet)
+        {
+            inference.batch = defaultBatch;
+        }
+        // If maxBatch is not set, set it to be equal to batch.
+        if (!maxBatchWasSet)
+        {
+            build.maxBatch = inference.batch;
+        }
+        // MaxBatch should not be less than batch.
+        if (build.maxBatch < inference.batch)
+        {
+            throw std::invalid_argument("Build max batch " + std::to_string(build.maxBatch)
+                + " is less than inference batch " + std::to_string(inference.batch));
+        }
+    }
+
+    if (build.shapes.empty() && !inference.shapes.empty())
+    {
+        // If --shapes are provided but --optShapes are not, assume that optShapes is the same as shapes.
+ for (auto& s : inference.shapes) + { + insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kMIN, s.first, s.second); + insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kOPT, s.first, s.second); + insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kMAX, s.first, s.second); + } + } + else if (!build.shapes.empty() && inference.shapes.empty()) + { + // If --optShapes are provided but --shapes are not, assume that shapes is the same as optShapes. + for (auto& s : build.shapes) + { + insertShapesInference( + inference.shapes, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); + } + } + + reporting.parse(arguments); + helps = parseHelp(arguments); + + if (!helps) + { + if (!build.load && model.baseModel.format == ModelFormat::kANY) + { + throw std::invalid_argument("Model missing or format not recognized"); + } + if (build.safe && system.DLACore >= 0) + { + auto checkSafeDLAFormats = [](std::vector const& fmt) { + return fmt.empty() ? false : std::all_of(fmt.begin(), fmt.end(), [](IOFormat const& pair) { + bool supported{false}; + bool const isLINEAR{pair.second == 1U << static_cast(nvinfer1::TensorFormat::kLINEAR)}; + bool const isCHW4{pair.second == 1U << static_cast(nvinfer1::TensorFormat::kCHW4)}; + bool const isCHW32{pair.second == 1U << static_cast(nvinfer1::TensorFormat::kCHW32)}; + bool const isCHW16{pair.second == 1U << static_cast(nvinfer1::TensorFormat::kCHW16)}; + supported |= pair.first == nvinfer1::DataType::kINT8 && (isLINEAR || isCHW4 || isCHW32); + supported |= pair.first == nvinfer1::DataType::kHALF && (isLINEAR || isCHW4 || isCHW16); + return supported; + }); + }; + if (!checkSafeDLAFormats(build.inputFormats) || !checkSafeDLAFormats(build.outputFormats)) + { + throw std::invalid_argument( + "I/O formats for safe DLA capability are restricted to fp16/int8:linear, fp16:chw16 or int8:chw32"); + } + if (system.fallback) + { + throw std::invalid_argument("GPU fallback (--allowGPUFallback) not allowed for safe DLA capability"); + } + } + } +} + +void SafeBuilderOptions::parse(Arguments& arguments) +{ + auto getFormats = [&arguments](std::vector& formatsVector, const char* argument) { + std::string list; + getAndDelOption(arguments, argument, list); + std::vector formats{splitToStringVec(list, ',')}; + for (const auto& f : formats) + { + formatsVector.push_back(stringToValue(f)); + } + }; + + getAndDelOption(arguments, "--serialized", serialized); + getAndDelOption(arguments, "--onnx", onnxModelFile); + getAndDelOption(arguments, "--help", help); + getAndDelOption(arguments, "-h", help); + getAndDelOption(arguments, "--verbose", verbose); + getAndDelOption(arguments, "-v", verbose); + getFormats(inputFormats, "--inputIOFormats"); + getFormats(outputFormats, "--outputIOFormats"); + getAndDelOption(arguments, "--int8", int8); + getAndDelOption(arguments, "--calib", calibFile); + getAndDelOption(arguments, "--consistency", consistency); + getAndDelOption(arguments, "--std", standard); + std::string pluginName; + while (getAndDelOption(arguments, "--plugins", pluginName)) + { + plugins.emplace_back(pluginName); + } +} + +std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options) +{ + os << "=== Model Options ===" << std::endl; + + os << "Format: "; + switch (options.format) + { + case ModelFormat::kCAFFE: + { + os << "Caffe"; + break; + } + case ModelFormat::kONNX: + { + os << "ONNX"; + break; + } + case ModelFormat::kUFF: + { + os << "UFF"; + break; + } + case ModelFormat::kANY: + os << "*"; + break; + } + os << 
std::endl << "Model: " << options.model << std::endl; + + return os; +} + +std::ostream& operator<<(std::ostream& os, const UffInput& input) +{ + os << "Uff Inputs Layout: " << (input.NHWC ? "NHWC" : "NCHW") << std::endl; + for (const auto& i : input.inputs) + { + os << "Input: " << i.first << "," << i.second.d[0] << "," << i.second.d[1] << "," << i.second.d[2] << std::endl; + } + + return os; +} + +std::ostream& operator<<(std::ostream& os, const ModelOptions& options) +{ + os << options.baseModel; + switch (options.baseModel.format) + { + case ModelFormat::kCAFFE: + { + os << "Prototxt: " << options.prototxt << std::endl; + break; + } + case ModelFormat::kUFF: + { + os << options.uffInputs; + break; + } + case ModelFormat::kONNX: // Fallthrough: No options to report for ONNX or the generic case + case ModelFormat::kANY: + break; + } + + os << "Output:"; + for (const auto& o : options.outputs) + { + os << " " << o; + } + os << std::endl; + + return os; +} + +std::ostream& operator<<(std::ostream& os, nvinfer1::DataType dtype) +{ + switch (dtype) + { + case nvinfer1::DataType::kFLOAT: + { + os << "fp32"; + break; + } + case nvinfer1::DataType::kHALF: + { + os << "fp16"; + break; + } + case nvinfer1::DataType::kINT8: + { + os << "int8"; + break; + } + case nvinfer1::DataType::kINT32: + { + os << "int32"; + break; + } + case nvinfer1::DataType::kBOOL: + { + os << "bool"; + break; + } + } + return os; +} + +std::ostream& operator<<(std::ostream& os, IOFormat const& format) +{ + os << format.first << ":"; + + for (int32_t f = 0; f < nvinfer1::EnumMax(); ++f) + { + if ((1U << f) & format.second) + { + if (f) + { + os << "+"; + } + switch (nvinfer1::TensorFormat(f)) + { + case nvinfer1::TensorFormat::kLINEAR: + { + os << "chw"; + break; + } + case nvinfer1::TensorFormat::kCHW2: + { + os << "chw2"; + break; + } + case nvinfer1::TensorFormat::kHWC8: + { + os << "hwc8"; + break; + } +#if (NV_TENSORRT_MAJOR > 7) + case nvinfer1::TensorFormat::kHWC16: + { + os << "hwc16"; + break; + } +#endif + case nvinfer1::TensorFormat::kCHW4: + { + os << "chw4"; + break; + } + case nvinfer1::TensorFormat::kCHW16: + { + os << "chw16"; + break; + } + case nvinfer1::TensorFormat::kCHW32: + { + os << "chw32"; + break; + } + case nvinfer1::TensorFormat::kDHWC8: + { + os << "dhwc8"; + break; + } + case nvinfer1::TensorFormat::kCDHW32: + { + os << "cdhw32"; + break; + } + case nvinfer1::TensorFormat::kHWC: + { + os << "hwc"; + break; + } + case nvinfer1::TensorFormat::kDLA_LINEAR: + { + os << "dla_linear"; + break; + } + case nvinfer1::TensorFormat::kDLA_HWC4: + { + os << "dla_hwc4"; + break; + } + } + } + } + return os; +} + +std::ostream& operator<<(std::ostream& os, const ShapeRange& dims) +{ + int32_t i = 0; + for (const auto& d : dims) + { + if (!d.size()) + { + break; + } + os << (i ? "+" : "") << d; + ++i; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, LayerPrecisions const& layerPrecisions) +{ + int32_t i = 0; + for (auto const& layerPrecision : layerPrecisions) + { + os << (i ? 
"," : "") << layerPrecision.first << ":" << layerPrecision.second; + ++i; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, const BuildOptions& options) +{ + // clang-format off + os << "=== Build Options ===" << std::endl << + + "Max batch: "; printBatch(os, options.maxBatch) << std::endl << + "Memory Pools: "; printMemoryPools(os, options) << std::endl << + "minTiming: " << options.minTiming << std::endl << + "avgTiming: " << options.avgTiming << std::endl << + "Precision: "; printPrecision(os, options) << std::endl << + "LayerPrecisions: " << options.layerPrecisions << std::endl << + "Calibration: " << (options.int8 && options.calibration.empty() ? "Dynamic" : options.calibration.c_str()) << std::endl << + "Refit: " << boolToEnabled(options.refittable) << std::endl << + "Sparsity: "; printSparsity(os, options) << std::endl << + "Safe mode: " << boolToEnabled(options.safe) << std::endl << + "DirectIO mode: " << boolToEnabled(options.directIO) << std::endl << + "Restricted mode: " << boolToEnabled(options.restricted) << std::endl << + "Save engine: " << (options.save ? options.engine : "") << std::endl << + "Load engine: " << (options.load ? options.engine : "") << std::endl << + "Profiling verbosity: " << static_cast(options.profilingVerbosity) << std::endl << + "Tactic sources: "; printTacticSources(os, options.enabledTactics, options.disabledTactics) << std::endl << + "timingCacheMode: "; printTimingCache(os, options) << std::endl << + "timingCacheFile: " << options.timingCacheFile << std::endl; + // clang-format on + + auto printIOFormats = [](std::ostream& os, const char* direction, const std::vector formats) { + if (formats.empty()) + { + os << direction << "s format: fp32:CHW" << std::endl; + } + else + { + for(const auto& f : formats) + { + os << direction << ": " << f << std::endl; + } + } + }; + + printIOFormats(os, "Input(s)", options.inputFormats); + printIOFormats(os, "Output(s)", options.outputFormats); + printShapes(os, "build", options.shapes); + printShapes(os, "calibration", options.shapesCalib); + + return os; +} + +std::ostream& operator<<(std::ostream& os, const SystemOptions& options) +{ + // clang-format off + os << "=== System Options ===" << std::endl << + + "Device: " << options.device << std::endl << + "DLACore: " << (options.DLACore != -1 ? std::to_string(options.DLACore) : "") << + (options.DLACore != -1 && options.fallback ? 
"(With GPU fallback)" : "") << std::endl; + os << "Plugins:"; + + for (const auto& p : options.plugins) + { + os << " " << p; + } + os << std::endl; + + return os; + // clang-format on +} + +std::ostream& operator<<(std::ostream& os, const InferenceOptions& options) +{ +// clang-format off + os << "=== Inference Options ===" << std::endl << + + "Batch: "; + if (options.batch && options.shapes.empty()) + { + os << options.batch << std::endl; + } + else + { + os << "Explicit" << std::endl; + } + printShapes(os, "inference", options.shapes); + os << "Iterations: " << options.iterations << std::endl << + "Duration: " << options.duration << "s (+ " + << options.warmup << "ms warm up)" << std::endl << + "Sleep time: " << options.sleep << "ms" << std::endl << + "Idle time: " << options.idle << "ms" << std::endl << + "Streams: " << options.streams << std::endl << + "ExposeDMA: " << boolToEnabled(!options.overlap) << std::endl << + "Data transfers: " << boolToEnabled(!options.skipTransfers) << std::endl << + "Spin-wait: " << boolToEnabled(options.spin) << std::endl << + "Multithreading: " << boolToEnabled(options.threads) << std::endl << + "CUDA Graph: " << boolToEnabled(options.graph) << std::endl << + "Separate profiling: " << boolToEnabled(options.rerun) << std::endl << + "Time Deserialize: " << boolToEnabled(options.timeDeserialize) << std::endl << + "Time Refit: " << boolToEnabled(options.timeRefit) << std::endl << + "Skip inference: " << boolToEnabled(options.skip) << std::endl; + +// clang-format on + os << "Inputs:" << std::endl; + for (const auto& input : options.inputs) + { + os << input.first << "<-" << input.second << std::endl; + } + + return os; +} + +std::ostream& operator<<(std::ostream& os, const ReportingOptions& options) +{ +// clang-format off + os << "=== Reporting Options ===" << std::endl << + + "Verbose: " << boolToEnabled(options.verbose) << std::endl << + "Averages: " << options.avgs << " inferences" << std::endl << + "Percentile: " << options.percentile << std::endl << + "Dump refittable layers:" << boolToEnabled(options.refit) << std::endl << + "Dump output: " << boolToEnabled(options.output) << std::endl << + "Profile: " << boolToEnabled(options.profile) << std::endl << + "Export timing to JSON file: " << options.exportTimes << std::endl << + "Export output to JSON file: " << options.exportOutput << std::endl << + "Export profile to JSON file: " << options.exportProfile << std::endl; +// clang-format on + + return os; +} + +std::ostream& operator<<(std::ostream& os, const AllOptions& options) +{ + os << options.model << options.build << options.system << options.inference << options.reporting << std::endl; + return os; +} + +std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options) +{ + auto printIOFormats = [](std::ostream& os, const char* direction, const std::vector formats) { + if (formats.empty()) + { + os << direction << "s format: fp32:CHW" << std::endl; + } + else + { + for(const auto& f : formats) + { + os << direction << ": " << f << std::endl; + } + } + }; + + os << "=== Build Options ===" << std::endl; + os << "Model ONNX: " << options.onnxModelFile << std::endl; + + os << "Precision: FP16"; + if (options.int8) + { + os << " + INT8"; + } + os << std::endl; + os << "Calibration file: " << options.calibFile << std::endl; + os << "Serialized Network: " << options.serialized << std::endl; + + printIOFormats(os, "Input(s)", options.inputFormats); + printIOFormats(os, "Output(s)", options.outputFormats); + + os << "Plugins:"; + for (const auto& 
p : options.plugins) + { + os << " " << p; + } + os << std::endl; + return os; +} + +void BaseModelOptions::help(std::ostream& os) +{ +// clang-format off + os << " --uff= UFF model" << std::endl << + " --onnx= ONNX model" << std::endl << + " --model= Caffe model (default = no model, random weights used)" << std::endl; +// clang-format on +} + +void UffInput::help(std::ostream& os) +{ +// clang-format off + os << " --uffInput=,X,Y,Z Input blob name and its dimensions (X,Y,Z=C,H,W), it can be specified " + "multiple times; at least one is required for UFF models" << std::endl << + " --uffNHWC Set if inputs are in the NHWC layout instead of NCHW (use " << + "X,Y,Z=H,W,C order in --uffInput)" << std::endl; +// clang-format on +} + +void ModelOptions::help(std::ostream& os) +{ +// clang-format off + os << "=== Model Options ===" << std::endl; + BaseModelOptions::help(os); + os << " --deploy= Caffe prototxt file" << std::endl << + " --output=[,]* Output names (it can be specified multiple times); at least one output " + "is required for UFF and Caffe" << std::endl; + UffInput::help(os); +// clang-format on +} + +void BuildOptions::help(std::ostream& os) +{ +// clang-format off + os << "=== Build Options ===" "\n" + " --maxBatch Set max batch size and build an implicit batch engine (default = same size as --batch)" "\n" + " This option should not be used when the input model is ONNX or when dynamic shapes are provided." "\n" + " --minShapes=spec Build with dynamic shapes using a profile with the min shapes provided" "\n" + " --optShapes=spec Build with dynamic shapes using a profile with the opt shapes provided" "\n" + " --maxShapes=spec Build with dynamic shapes using a profile with the max shapes provided" "\n" + " --minShapesCalib=spec Calibrate with dynamic shapes using a profile with the min shapes provided" "\n" + " --optShapesCalib=spec Calibrate with dynamic shapes using a profile with the opt shapes provided" "\n" + " --maxShapesCalib=spec Calibrate with dynamic shapes using a profile with the max shapes provided" "\n" + " Note: All three of min, opt and max shapes must be supplied." "\n" + " However, if only opt shapes is supplied then it will be expanded so" "\n" + " that min shapes and max shapes are set to the same values as opt shapes." "\n" + " Input names can be wrapped with escaped single quotes (ex: \\\'Input:0\\\')." "\n" + " Example input shapes spec: input0:1x3x256x256,input1:1x3x128x128" "\n" + " Each input shape is supplied as a key-value pair where key is the input name and" "\n" + " value is the dimensions (including the batch dimension) to be used for that input." "\n" + " Each key-value pair has the key and value separated using a colon (:)." "\n" + " Multiple input shapes can be provided via comma-separated key-value pairs." "\n" + " --inputIOFormats=spec Type and format of each of the input tensors (default = all inputs in fp32:chw)" "\n" + " See --outputIOFormats help for the grammar of type and format list." "\n" + " Note: If this option is specified, please set comma-separated types and formats for all" "\n" + " inputs following the same order as network inputs ID (even if only one input" "\n" + " needs specifying IO format) or set the type and format once for broadcasting." 
"\n" + " --outputIOFormats=spec Type and format of each of the output tensors (default = all outputs in fp32:chw)" "\n" + " Note: If this option is specified, please set comma-separated types and formats for all" "\n" + " outputs following the same order as network outputs ID (even if only one output" "\n" + " needs specifying IO format) or set the type and format once for broadcasting." "\n" + " IO Formats: spec ::= IOfmt[\",\"spec]" "\n" + " IOfmt ::= type:fmt" "\n" + " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" "\n" + " fmt ::= (\"chw\"|\"chw2\"|\"chw4\"|\"hwc8\"|\"chw16\"|\"chw32\"|\"dhwc8\")[\"+\"fmt]" "\n" + " --workspace=N Set workspace size in MiB." "\n" + " --memPoolSize=poolspec Specify the size constraints of the designated memory pool(s) in MiB." "\n" + " Note: Also accepts decimal sizes, e.g. 0.25MiB. Will be rounded down to the nearest integer bytes." "\n" + " Pool constraint: poolspec ::= poolfmt[\",\"poolspec]" "\n" + " poolfmt ::= pool:sizeInMiB" "\n" + " pool ::= \"workspace\"|\"dlaSRAM\"|\"dlaLocalDRAM\"|\"dlaGlobalDRAM\"" "\n" + " --profilingVerbosity=mode Specify profiling verbosity. mode ::= layer_names_only|detailed|none (default = layer_names_only)" "\n" + " --minTiming=M Set the minimum number of iterations used in kernel selection (default = " + << defaultMinTiming << ")" "\n" + " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " + << defaultAvgTiming << ")" "\n" + " --refit Mark the engine as refittable. This will allow the inspection of refittable layers " "\n" + " and weights within the engine." "\n" + " --sparsity=spec Control sparsity (default = disabled). " "\n" + " Sparsity: spec ::= \"disable\", \"enable\", \"force\"" "\n" + " Note: Description about each of these options is as below" "\n" + " disable = do not enable sparse tactics in the builder (this is the default)" "\n" + " enable = enable sparse tactics in the builder (but these tactics will only be" "\n" + " considered if the weights have the right sparsity pattern)" "\n" + " force = enable sparse tactics in the builder and force-overwrite the weights to have" "\n" + " a sparsity pattern (even if you loaded a model yourself)" "\n" + " --noTF32 Disable tf32 precision (default is to enable tf32, in addition to fp32)" "\n" + " --fp16 Enable fp16 precision, in addition to fp32 (default = disabled)" "\n" + " --int8 Enable int8 precision, in addition to fp32 (default = disabled)" "\n" + " --best Enable all precisions to achieve the best performance (default = disabled)" "\n" + " --directIO Avoid reformatting at network boundaries. (default = disabled)" "\n" + " --precisionConstraints=spec Control precision constraint setting. (default = none)" "\n" + " Precision Constaints: spec ::= \"none\" | \"obey\" | \"prefer\"" "\n" + " none = no constraints" "\n" + " prefer = meet precision constraints set by --layerPrecisions/--layerOutputTypes if possible" "\n" + " obey = meet precision constraints set by --layerPrecisions/--layerOutputTypes or fail" "\n" + " otherwise" "\n" + " --layerPrecisions=spec Control per-layer precision constraints. Effective only when precisionConstraints is set to" "\n" + " \"obey\" or \"prefer\". (default = none)" "\n" + " The specs are read left-to-right, and later ones override earlier ones. \"*\" can be used as a" "\n" + " layerName to specify the default precision for all the unspecified layers." 
"\n" + " Per-layer precision spec ::= layerPrecision[\",\"spec]" "\n" + " layerPrecision ::= layerName\":\"precision" "\n" + " precision ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" "\n" + " --layerOutputTypes=spec Control per-layer output type constraints. Effective only when precisionConstraints is set to" "\n" + " \"obey\" or \"prefer\". (default = none)" "\n" + " The specs are read left-to-right, and later ones override earlier ones. \"*\" can be used as a" "\n" + " layerName to specify the default precision for all the unspecified layers. If a layer has more than""\n" + " one output, then multiple types separated by \"+\" can be provided for this layer." "\n" + " Per-layer output type spec ::= layerOutputTypes[\",\"spec]" "\n" + " layerOutputTypes ::= layerName\":\"type" "\n" + " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"[\"+\"type]" "\n" + " --calib= Read INT8 calibration cache file" "\n" + " --safe Enable build safety certified engine" "\n" + " --consistency Perform consistency checking on safety certified engine" "\n" + " --restricted Enable safety scope checking with kSAFETY_SCOPE build flag" "\n" + " --saveEngine= Save the serialized engine" "\n" + " --loadEngine= Load a serialized engine" "\n" + " --tacticSources=tactics Specify the tactics to be used by adding (+) or removing (-) tactics from the default " "\n" + " tactic sources (default = all available tactics)." "\n" + " Note: Currently only cuDNN, cuBLAS and cuBLAS-LT are listed as optional tactics." "\n" + " Tactic Sources: tactics ::= [\",\"tactic]" "\n" + " tactic ::= (+|-)lib" "\n" + " lib ::= \"CUBLAS\"|\"CUBLAS_LT\"|\"CUDNN\"" "\n" + " For example, to disable cudnn and enable cublas: --tacticSources=-CUDNN,+CUBLAS" "\n" + " --noBuilderCache Disable timing cache in builder (default is to enable timing cache)" "\n" + " --timingCacheFile= Save/load the serialized global timing cache" "\n" + ; +// clang-format on + os << std::flush; +} + +void SystemOptions::help(std::ostream& os) +{ +// clang-format off + os << "=== System Options ===" << std::endl << + " --device=N Select cuda device N (default = " << defaultDevice << ")" << std::endl << + " --useDLACore=N Select DLA core N for layers that support DLA (default = none)" << std::endl << + " --allowGPUFallback When DLA is enabled, allow GPU fallback for unsupported layers " + "(default = disabled)" << std::endl; + os << " --plugins Plugin library (.so) to load (can be specified multiple times)" << std::endl; +// clang-format on +} + +void InferenceOptions::help(std::ostream& os) +{ + // clang-format off + os << "=== Inference Options ===" << std::endl << + " --batch=N Set batch size for implicit batch engines (default = " << defaultBatch << ")" << std::endl << + " This option should not be used when the engine is built from an ONNX model or when dynamic" << std::endl << + " shapes are provided when the engine is built." << std::endl << + " --shapes=spec Set input shapes for dynamic shapes inference inputs." << std::endl << + " Note: Input names can be wrapped with escaped single quotes (ex: \\\'Input:0\\\')." << std::endl << + " Example input shapes spec: input0:1x3x256x256, input1:1x3x128x128" << std::endl << + " Each input shape is supplied as a key-value pair where key is the input name and" << std::endl << + " value is the dimensions (including the batch dimension) to be used for that input." << std::endl << + " Each key-value pair has the key and value separated using a colon (:)." 
<< std::endl << + " Multiple input shapes can be provided via comma-separated key-value pairs." << std::endl << + " --loadInputs=spec Load input values from files (default = generate random inputs). Input names can be " + "wrapped with single quotes (ex: 'Input:0')" << std::endl << + " Input values spec ::= Ival[\",\"spec]" << std::endl << + " Ival ::= name\":\"file" << std::endl << + " --iterations=N Run at least N inference iterations (default = " << defaultIterations << ")" << std::endl << + " --warmUp=N Run for N milliseconds to warmup before measuring performance (default = " + << defaultWarmUp << ")" << std::endl << + " --duration=N Run performance measurements for at least N seconds wallclock time (default = " + << defaultDuration << ")" << std::endl << + " --sleepTime=N Delay inference start with a gap of N milliseconds between launch and compute " + "(default = " << defaultSleep << ")" << std::endl << + " --idleTime=N Sleep N milliseconds between two continuous iterations" + "(default = " << defaultIdle << ")" << std::endl << + " --streams=N Instantiate N engines to use concurrently (default = " << defaultStreams << ")" << std::endl << + " --exposeDMA Serialize DMA transfers to and from device (default = disabled)." << std::endl << + " --noDataTransfers Disable DMA transfers to and from device (default = enabled)." << std::endl << + " --useManagedMemory Use managed memory instead of seperate host and device allocations (default = disabled)." << std::endl << + " --useSpinWait Actively synchronize on GPU events. This option may decrease synchronization time but " + "increase CPU usage and power (default = disabled)" << std::endl << + " --threads Enable multithreading to drive engines with independent threads" + " or speed up refitting (default = disabled) " << std::endl << + " --useCudaGraph Use CUDA graph to capture engine execution and then launch inference (default = disabled)." << std::endl << + " This flag may be ignored if the graph capture fails." << std::endl << + " --timeDeserialize Time the amount of time it takes to deserialize the network and exit." << std::endl << + " --timeRefit Time the amount of time it takes to refit the engine before inference." 
<< std::endl << + " --separateProfileRun Do not attach the profiler in the benchmark run; if profiling is enabled, a second " + "profile run will be executed (default = disabled)" << std::endl << + " --buildOnly Skip inference perf measurement (default = disabled)" << std::endl; + // clang-format on +} + +void ReportingOptions::help(std::ostream& os) +{ +// clang-format off + os << "=== Reporting Options ===" << std::endl << + " --verbose Use verbose logging (default = false)" << std::endl << + " --avgRuns=N Report performance measurements averaged over N consecutive " + "iterations (default = " << defaultAvgRuns << ")" << std::endl << + " --percentile=P Report performance for the P percentage (0<=P<=100, 0 " + "representing max perf, and 100 representing min perf; default" + " = " << defaultPercentile << "%)" << std::endl << + " --dumpRefit Print the refittable layers and weights from a refittable " + "engine" << std::endl << + " --dumpOutput Print the output tensor(s) of the last inference iteration " + "(default = disabled)" << std::endl << + " --dumpProfile Print profile information per layer (default = disabled)" << std::endl << + " --dumpLayerInfo Print layer information of the engine to console " + "(default = disabled)" << std::endl << + " --exportTimes=<file> Write the timing results in a json file (default = disabled)" << std::endl << + " --exportOutput=<file> Write the output tensors to a json file (default = disabled)" << std::endl << + " --exportProfile=<file> Write the profile information per layer in a json file " + "(default = disabled)" << std::endl << + " --exportLayerInfo=<file> Write the layer information of the engine in a json file " + "(default = disabled)" << std::endl; +// clang-format on +} + +void helpHelp(std::ostream& os) +{ +// clang-format off + os << "=== Help ===" << std::endl << + " --help, -h Print this message" << std::endl; +// clang-format on +} + +void AllOptions::help(std::ostream& os) +{ + ModelOptions::help(os); + os << std::endl; + BuildOptions::help(os); + os << std::endl; + InferenceOptions::help(os); + os << std::endl; +// clang-format off + os << "=== Build and Inference Batch Options ===" << std::endl << + " When using implicit batch, the max batch size of the engine, if not given, " << std::endl << + " is set to the inference batch size;" << std::endl << + " when using explicit batch, if shapes are specified only for inference, they " << std::endl << + " will be used also as min/opt/max in the build profile; if shapes are " << std::endl << + " specified only for the build, the opt shapes will be used also for inference;" << std::endl << + " if both are specified, they must be compatible; and if explicit batch is " << std::endl << + " enabled but neither is specified, the model must provide complete static" << std::endl << + " dimensions, including batch size, for all inputs" << std::endl << + " Using ONNX models automatically forces explicit batch." << std::endl << + std::endl; + // clang-format on + ReportingOptions::help(os); + os << std::endl; + SystemOptions::help(os); + os << std::endl; + helpHelp(os); +} + +void SafeBuilderOptions::printHelp(std::ostream& os) +{ +// clang-format off + os << "=== Mandatory ===" << std::endl << + " --onnx=<file> ONNX model" << std::endl << + " " << std::endl << + "=== Optional ===" << std::endl << + " --inputIOFormats=spec Type and format of each of the input tensors (default = all inputs in fp32:chw)" << std::endl << + " See --outputIOFormats help for the grammar of type and format list."
<< std::endl << + " Note: If this option is specified, please set comma-separated types and formats for all" << std::endl << + " inputs following the same order as network inputs ID (even if only one input" << std::endl << + " needs specifying IO format) or set the type and format once for broadcasting." << std::endl << + " --outputIOFormats=spec Type and format of each of the output tensors (default = all outputs in fp32:chw)" << std::endl << + " Note: If this option is specified, please set comma-separated types and formats for all" << std::endl << + " outputs following the same order as network outputs ID (even if only one output" << std::endl << + " needs specifying IO format) or set the type and format once for broadcasting." << std::endl << + " IO Formats: spec ::= IOfmt[\",\"spec]" << std::endl << + " IOfmt ::= type:fmt" << std::endl << + " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" << std::endl << + " fmt ::= (\"chw\"|\"chw2\"|\"chw4\"|\"hwc8\"|\"chw16\"|\"chw32\"|\"dhwc8\")[\"+\"fmt]" << std::endl << + " --int8 Enable int8 precision, in addition to fp16 (default = disabled)" << std::endl << + " --consistency Enable consistency check for serialized engine, (default = disabled)" << std::endl << + " --std Build standard serialized engine, (default = disabled)" << std::endl << + " --calib=<file> Read INT8 calibration cache file" << std::endl << + " --serialized=<file> Save the serialized network" << std::endl << + " --plugins Plugin library (.so) to load (can be specified multiple times)" << std::endl << + " --verbose or -v Use verbose logging (default = false)" << std::endl << + " --help or -h Print this message" << std::endl << + " " << std::endl; +// clang-format on +} + +} // namespace sample
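The printers above are driven from the option structs declared in the header that follows. For orientation, here is a minimal sketch of how trtexec-style tools typically consume this API; the driver code itself is not part of this patch, and error handling for unrecognized arguments is omitted:

    #include <iostream>
    #include "sampleOptions.h"

    int main(int argc, char* argv[])
    {
        sample::Arguments args = sample::argsToArgumentsMap(argc, argv);
        if (sample::parseHelp(args))
        {
            sample::AllOptions::help(std::cout); // prints the sections shown above
            return 0;
        }
        sample::AllOptions options;
        options.parse(args);  // consumes recognized --key=value pairs
        std::cout << options; // operator<< overloads declared in sampleOptions.h
        return 0;
    }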
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.h new file mode 100644 index 00000000..8975e1ea --- /dev/null +++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.h @@ -0,0 +1,355 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_SAMPLE_OPTIONS_H +#define TRT_SAMPLE_OPTIONS_H + +#include <algorithm> +#include <array> +#include <iostream> +#include <stdexcept> +#include <string> +#include <unordered_map> +#include <utility> +#include <vector> + +#include "NvInfer.h" + +namespace sample +{ + +// Build default params +constexpr int32_t maxBatchNotProvided{0}; +constexpr int32_t defaultMinTiming{1}; +constexpr int32_t defaultAvgTiming{8}; + +// System default params +constexpr int32_t defaultDevice{0}; + +// Inference default params +constexpr int32_t defaultBatch{1}; +constexpr int32_t batchNotProvided{0}; +constexpr int32_t defaultStreams{1}; +constexpr int32_t defaultIterations{10}; +constexpr float defaultWarmUp{200.F}; +constexpr float defaultDuration{3.F}; +constexpr float defaultSleep{}; +constexpr float defaultIdle{}; + +// Reporting default params +constexpr int32_t defaultAvgRuns{10}; +constexpr float defaultPercentile{99}; + +enum class PrecisionConstraints +{ + kNONE, + kOBEY, + kPREFER +}; + +enum class ModelFormat +{ + kANY, + kCAFFE, + kONNX, + kUFF +}; + +enum class SparsityFlag +{ + kDISABLE, + kENABLE, + kFORCE +}; + +enum class TimingCacheMode +{ + kDISABLE, + kLOCAL, + kGLOBAL +}; + +using Arguments = std::unordered_multimap<std::string, std::string>; + +using IOFormat = std::pair<nvinfer1::DataType, nvinfer1::TensorFormat>; + +using ShapeRange = std::array<std::vector<int32_t>, nvinfer1::EnumMax<nvinfer1::OptProfileSelector>()>; + +using LayerPrecisions = std::unordered_map<std::string, nvinfer1::DataType>; +using LayerOutputTypes = std::unordered_map<std::string, std::vector<nvinfer1::DataType>>; + +struct Options +{ + virtual void parse(Arguments& arguments) = 0; +}; + +struct BaseModelOptions : public Options +{ + ModelFormat format{ModelFormat::kANY}; + std::string model; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +struct UffInput : public Options +{ + std::vector<std::pair<std::string, nvinfer1::Dims>> inputs; + bool NHWC{false}; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +struct ModelOptions : public Options +{ + BaseModelOptions baseModel; + std::string prototxt; + std::vector<std::string> outputs; + UffInput uffInputs; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +struct BuildOptions : public Options +{ + int32_t maxBatch{maxBatchNotProvided}; + double workspace{-1.0}; + double dlaSRAM{-1.0}; + double dlaLocalDRAM{-1.0}; + double dlaGlobalDRAM{-1.0}; + int32_t minTiming{defaultMinTiming}; + int32_t avgTiming{defaultAvgTiming}; + bool tf32{true}; + bool fp16{false}; + bool int8{false}; + bool directIO{false}; + PrecisionConstraints precisionConstraints{PrecisionConstraints::kNONE}; + LayerPrecisions layerPrecisions; + LayerOutputTypes layerOutputTypes; + bool safe{false}; + bool consistency{false}; + bool restricted{false}; + bool save{false}; + bool load{false}; + bool refittable{false}; + SparsityFlag sparsity{SparsityFlag::kDISABLE}; +#if (NV_TENSORRT_MAJOR > 7) + nvinfer1::ProfilingVerbosity profilingVerbosity{nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY}; +#else + nvinfer1::ProfilingVerbosity profilingVerbosity{ nvinfer1::ProfilingVerbosity::kDEFAULT }; +#endif + std::string engine; + std::string calibration; + std::unordered_map<std::string, ShapeRange> shapes; + std::unordered_map<std::string, ShapeRange> shapesCalib; + std::vector<IOFormat> inputFormats; + std::vector<IOFormat> outputFormats; + nvinfer1::TacticSources enabledTactics{0}; + nvinfer1::TacticSources disabledTactics{0}; + TimingCacheMode timingCacheMode{TimingCacheMode::kLOCAL}; + std::string timingCacheFile{}; + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +};
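For orientation, the precision members of BuildOptions map almost one-to-one onto TensorRT builder-config flags. A hedged sketch of that mapping (the helper name is illustrative and not part of this header; the flag names are standard nvinfer1 API):

    // Illustrative only: how the precision fields typically reach IBuilderConfig.
    void applyPrecisionFlags(nvinfer1::IBuilderConfig& config, const sample::BuildOptions& build)
    {
        if (!build.tf32)
            config.clearFlag(nvinfer1::BuilderFlag::kTF32); // TF32 is enabled by default
        if (build.fp16)
            config.setFlag(nvinfer1::BuilderFlag::kFP16);
        if (build.int8)
            config.setFlag(nvinfer1::BuilderFlag::kINT8);
        if (build.refittable)
            config.setFlag(nvinfer1::BuilderFlag::kREFIT);
    }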
+struct SystemOptions : public Options +{ + int32_t device{defaultDevice}; + int32_t DLACore{-1}; + bool fallback{false}; + std::vector<std::string> plugins; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +struct InferenceOptions : public Options +{ + int32_t batch{batchNotProvided}; + int32_t iterations{defaultIterations}; + int32_t streams{defaultStreams}; + float warmup{defaultWarmUp}; + float duration{defaultDuration}; + float sleep{defaultSleep}; + float idle{defaultIdle}; + bool overlap{true}; + bool skipTransfers{false}; + bool useManaged{false}; + bool spin{false}; + bool threads{false}; + bool graph{false}; + bool skip{false}; + bool rerun{false}; + bool timeDeserialize{false}; + bool timeRefit{false}; + std::unordered_map<std::string, std::string> inputs; + std::unordered_map<std::string, std::vector<int32_t>> shapes; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +struct ReportingOptions : public Options +{ + bool verbose{false}; + int32_t avgs{defaultAvgRuns}; + float percentile{defaultPercentile}; + bool refit{false}; + bool output{false}; + bool profile{false}; + bool layerInfo{false}; + std::string exportTimes; + std::string exportOutput; + std::string exportProfile; + std::string exportLayerInfo; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +struct SafeBuilderOptions : public Options +{ + std::string serialized{}; + std::string onnxModelFile{}; + bool help{false}; + bool verbose{false}; + std::vector<IOFormat> inputFormats; + std::vector<IOFormat> outputFormats; + bool int8{false}; + std::string calibFile{}; + std::vector<std::string> plugins; + bool consistency{false}; + bool standard{false}; + + void parse(Arguments& arguments) override; + + static void printHelp(std::ostream& out); +}; + +struct AllOptions : public Options +{ + ModelOptions model; + BuildOptions build; + SystemOptions system; + InferenceOptions inference; + ReportingOptions reporting; + bool helps{false}; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +Arguments argsToArgumentsMap(int32_t argc, char* argv[]); + +bool parseHelp(Arguments& arguments); + +void helpHelp(std::ostream& out); + +// Functions to print options + +std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options); + +std::ostream& operator<<(std::ostream& os, const UffInput& input); + +std::ostream& operator<<(std::ostream& os, const IOFormat& format); + +std::ostream& operator<<(std::ostream& os, const ShapeRange& dims); + +std::ostream& operator<<(std::ostream& os, const ModelOptions& options); + +std::ostream& operator<<(std::ostream& os, const BuildOptions& options); + +std::ostream& operator<<(std::ostream& os, const SystemOptions& options); + +std::ostream& operator<<(std::ostream& os, const InferenceOptions& options); + +std::ostream& operator<<(std::ostream& os, const ReportingOptions& options); + +std::ostream& operator<<(std::ostream& os, const AllOptions& options); + +std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options); + +inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) +{ + for (int32_t i = 0; i < dims.nbDims; ++i) + { + os << (i ?
"x" : "") << dims.d[i]; + } + return os; +} +inline std::ostream& operator<<(std::ostream& os, const nvinfer1::WeightsRole role) +{ + switch (role) + { + case nvinfer1::WeightsRole::kKERNEL: + { + os << "Kernel"; + break; + } + case nvinfer1::WeightsRole::kBIAS: + { + os << "Bias"; + break; + } + case nvinfer1::WeightsRole::kSHIFT: + { + os << "Shift"; + break; + } + case nvinfer1::WeightsRole::kSCALE: + { + os << "Scale"; + break; + } + case nvinfer1::WeightsRole::kCONSTANT: + { + os << "Constant"; + break; + } +#if (NV_TENSORRT_MAJOR > 7) + case nvinfer1::WeightsRole::kANY: + { + os << "Any"; + break; + } +#endif + } + + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const std::vector& vec) +{ + for (int32_t i = 0, e = static_cast(vec.size()); i < e; ++i) + { + os << (i ? "x" : "") << vec[i]; + } + return os; +} + +} // namespace sample + +#endif // TRT_SAMPLES_OPTIONS_H diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.cpp b/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.cpp new file mode 100644 index 00000000..a92938c5 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.cpp @@ -0,0 +1,445 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sampleInference.h" +#include "sampleOptions.h" +#include "sampleReporting.h" + +namespace sample +{ + +namespace +{ + +//! +//! \brief Find percentile in an ascending sequence of timings +//! \note percentile must be in [0, 100]. Otherwise, an exception is thrown. +//! +template +float findPercentile(float percentile, std::vector const& timings, T const& toFloat) +{ + int32_t const all = static_cast(timings.size()); + int32_t const exclude = static_cast((1 - percentile / 100) * all); + if (timings.empty()) + { + return std::numeric_limits::infinity(); + } + if (percentile < 0.0f || percentile > 100.0f) + { + throw std::runtime_error("percentile is not in [0, 100]!"); + } + return toFloat(timings[std::max(all - 1 - exclude, 0)]); +} + +//! +//! \brief Find median in a sorted sequence of timings +//! +template +float findMedian(std::vector const& timings, T const& toFloat) +{ + if (timings.empty()) + { + return std::numeric_limits::infinity(); + } + + int32_t const m = timings.size() / 2; + if (timings.size() % 2) + { + return toFloat(timings[m]); + } + + return (toFloat(timings[m - 1]) + toFloat(timings[m])) / 2; +} + +//! +//! \brief Find coefficient of variance (which is std / mean) in a sorted sequence of timings given the mean +//! 
+//! +//! \brief Find coefficient of variance (which is std / mean) in a sorted sequence of timings given the mean +//! +template <typename T> +float findCoeffOfVariance(std::vector<InferenceTime> const& timings, T const& toFloat, float mean) +{ + if (timings.empty()) + { + return 0; + } + + if (mean == 0.F) + { + return std::numeric_limits<float>::infinity(); + } + + auto const metricAccumulator = [toFloat, mean](float acc, InferenceTime const& a) { + float const diff = toFloat(a) - mean; + return acc + diff * diff; + }; + float const variance = std::accumulate(timings.begin(), timings.end(), 0.F, metricAccumulator) / timings.size(); + + return std::sqrt(variance) / mean * 100.F; +} + +inline InferenceTime traceToTiming(const InferenceTrace& a) +{ + return InferenceTime((a.enqEnd - a.enqStart), (a.h2dEnd - a.h2dStart), (a.computeEnd - a.computeStart), + (a.d2hEnd - a.d2hStart), (a.d2hEnd - a.h2dStart)); +} + +} // namespace + +void printProlog(int32_t warmups, int32_t timings, float warmupMs, float benchTimeMs, std::ostream& os) +{ + os << "Warmup completed " << warmups << " queries over " << warmupMs << " ms" << std::endl; + os << "Timing trace has " << timings << " queries over " << benchTimeMs / 1000 << " s" << std::endl; +} + +void printTiming(std::vector<InferenceTime> const& timings, int32_t runsPerAvg, std::ostream& os) +{ + int32_t count = 0; + InferenceTime sum; + + os << std::endl; + os << "=== Trace details ===" << std::endl; + os << "Trace averages of " << runsPerAvg << " runs:" << std::endl; + for (auto const& t : timings) + { + sum += t; + + if (++count == runsPerAvg) + { + // clang-format off + os << "Average on " << runsPerAvg << " runs - GPU latency: " << sum.compute / runsPerAvg + << " ms - Host latency: " << sum.latency() / runsPerAvg << " ms (end to end " << sum.e2e / runsPerAvg + << " ms, enqueue " << sum.enq / runsPerAvg << " ms)" << std::endl; + // clang-format on + count = 0; + sum.enq = 0; + sum.h2d = 0; + sum.compute = 0; + sum.d2h = 0; + sum.e2e = 0; + } + } +} + +void printMetricExplanations(std::ostream& os) +{ + os << std::endl; + os << "=== Explanations of the performance metrics ===" << std::endl; + os << "Total Host Walltime: the host walltime from when the first query (after warmups) is enqueued to when the " + "last query is completed." + << std::endl; + os << "GPU Compute Time: the GPU latency to execute the kernels for a query." << std::endl; + os << "Total GPU Compute Time: the summation of the GPU Compute Time of all the queries. If this is significantly " + "shorter than Total Host Walltime, the GPU may be under-utilized because of host-side overheads or data " + "transfers." + << std::endl; + os << "Throughput: the observed throughput computed by dividing the number of queries by the Total Host Walltime. " + "If this is significantly lower than the reciprocal of GPU Compute Time, the GPU may be under-utilized " + "because of host-side overheads or data transfers." + << std::endl; + os << "Enqueue Time: the host latency to enqueue a query. If this is longer than GPU Compute Time, the GPU may be " + "under-utilized." + << std::endl; + os << "H2D Latency: the latency for host-to-device data transfers for input tensors of a single query." + << std::endl; + os << "D2H Latency: the latency for device-to-host data transfers for output tensors of a single query." + << std::endl; + os << "Latency: the summation of H2D Latency, GPU Compute Time, and D2H Latency. This is the latency to infer a " + "single query."
<< std::endl; + os << "End-to-End Host Latency: the duration from when the H2D of a query is called to when the D2H of the same " + "query is completed, which includes the latency to wait for the completion of the previous query. This is " + "the latency of a query if multiple queries are enqueued consecutively." + << std::endl; +} + +PerformanceResult getPerformanceResult(std::vector<InferenceTime> const& timings, + std::function<float(InferenceTime const&)> metricGetter, float percentile) +{ + auto const metricComparator + = [metricGetter](InferenceTime const& a, InferenceTime const& b) { return metricGetter(a) < metricGetter(b); }; + auto const metricAccumulator = [metricGetter](float acc, InferenceTime const& a) { return acc + metricGetter(a); }; + std::vector<InferenceTime> newTimings = timings; + std::sort(newTimings.begin(), newTimings.end(), metricComparator); + PerformanceResult result; + result.min = metricGetter(newTimings.front()); + result.max = metricGetter(newTimings.back()); + result.mean = std::accumulate(newTimings.begin(), newTimings.end(), 0.0f, metricAccumulator) / newTimings.size(); + result.median = findMedian(newTimings, metricGetter); + result.percentile = findPercentile(percentile, newTimings, metricGetter); + result.coeffVar = findCoeffOfVariance(newTimings, metricGetter, result.mean); + return result; +} + +void printEpilog(std::vector<InferenceTime> const& timings, float walltimeMs, float percentile, int32_t batchSize, + std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose) +{ + float const throughput = batchSize * timings.size() / walltimeMs * 1000; + + auto const getLatency = [](InferenceTime const& t) { return t.latency(); }; + auto const latencyResult = getPerformanceResult(timings, getLatency, percentile); + + auto const getEndToEnd = [](InferenceTime const& t) { return t.e2e; }; + auto const e2eLatencyResult = getPerformanceResult(timings, getEndToEnd, percentile); + + auto const getEnqueue = [](InferenceTime const& t) { return t.enq; }; + auto const enqueueResult = getPerformanceResult(timings, getEnqueue, percentile); + + auto const getH2d = [](InferenceTime const& t) { return t.h2d; }; + auto const h2dResult = getPerformanceResult(timings, getH2d, percentile); + + auto const getCompute = [](InferenceTime const& t) { return t.compute; }; + auto const gpuComputeResult = getPerformanceResult(timings, getCompute, percentile); + + auto const getD2h = [](InferenceTime const& t) { return t.d2h; }; + auto const d2hResult = getPerformanceResult(timings, getD2h, percentile); + + auto const toPerfString = [percentile](const PerformanceResult& r) { + std::stringstream s; + s << "min = " << r.min << " ms, max = " << r.max << " ms, mean = " << r.mean << " ms, " + << "median = " << r.median << " ms, percentile(" << percentile << "%) = " << r.percentile << " ms"; + return s.str(); + }; + + osInfo << std::endl; + osInfo << "=== Performance summary ===" << std::endl; + osInfo << "Throughput: " << throughput << " qps" << std::endl; + osInfo << "Latency: " << toPerfString(latencyResult) << std::endl; + osInfo << "End-to-End Host Latency: " << toPerfString(e2eLatencyResult) << std::endl; + osInfo << "Enqueue Time: " << toPerfString(enqueueResult) << std::endl; + osInfo << "H2D Latency: " << toPerfString(h2dResult) << std::endl; + osInfo << "GPU Compute Time: " << toPerfString(gpuComputeResult) << std::endl; + osInfo << "D2H Latency: " << toPerfString(d2hResult) << std::endl; + osInfo << "Total Host Walltime: " << walltimeMs / 1000 << " s" << std::endl; + osInfo << "Total GPU Compute Time: " << gpuComputeResult.mean *
timings.size() / 1000 << " s" << std::endl; + + // Report warnings if the throughput is bound by other factors than GPU Compute Time. + constexpr float kENQUEUE_BOUND_REPORTING_THRESHOLD{0.8F}; + if (enqueueResult.median > kENQUEUE_BOUND_REPORTING_THRESHOLD * gpuComputeResult.median) + { + osWarning + << "* Throughput may be bound by Enqueue Time rather than GPU Compute and the GPU may be under-utilized." + << std::endl; + osWarning << " If not already in use, --useCudaGraph (utilize CUDA graphs where possible) may increase the " + "throughput." + << std::endl; + } + if (h2dResult.median >= gpuComputeResult.median) + { + osWarning << "* Throughput may be bound by host-to-device transfers for the inputs rather than GPU Compute and " + "the GPU may be under-utilized." + << std::endl; + osWarning << " Add --noDataTransfers flag to disable data transfers." << std::endl; + } + if (d2hResult.median >= gpuComputeResult.median) + { + osWarning << "* Throughput may be bound by device-to-host transfers for the outputs rather than GPU Compute " + "and the GPU may be under-utilized." + << std::endl; + osWarning << " Add --noDataTransfers flag to disable data transfers." << std::endl; + } + + // Report warnings if the GPU Compute Time is unstable. + constexpr float kUNSTABLE_PERF_REPORTING_THRESHOLD{1.0F}; + if (gpuComputeResult.coeffVar > kUNSTABLE_PERF_REPORTING_THRESHOLD) + { + osWarning << "* GPU compute time is unstable, with coefficient of variance = " << gpuComputeResult.coeffVar + << "%." << std::endl; + osWarning << " If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the " + << "stability." << std::endl; + } + + // Explain what the metrics mean. + osInfo << "Explanations of the performance metrics are printed in the verbose logs." << std::endl; + printMetricExplanations(osVerbose); + + osInfo << std::endl; +} + +void printPerformanceReport(std::vector<InferenceTrace> const& trace, const ReportingOptions& reporting, float warmupMs, + int32_t batchSize, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose) +{ + auto const isNotWarmup = [&warmupMs](const InferenceTrace& a) { return a.computeStart >= warmupMs; }; + auto const noWarmup = std::find_if(trace.begin(), trace.end(), isNotWarmup); + int32_t const warmups = noWarmup - trace.begin(); + float const benchTime = trace.back().d2hEnd - noWarmup->h2dStart; + // when implicit batch used, batchSize = options.inference.batch, which is parsed through --batch + // when explicit batch used, batchSize = options.inference.batch = 0 + // treat inference with explicit batch as a single query and report the throughput + batchSize = batchSize ? batchSize : 1; + printProlog(warmups * batchSize, (trace.size() - warmups) * batchSize, warmupMs, benchTime, osInfo); + + std::vector<InferenceTime> timings(trace.size() - warmups); + std::transform(noWarmup, trace.end(), timings.begin(), traceToTiming); + printTiming(timings, reporting.avgs, osInfo); + printEpilog(timings, benchTime, reporting.percentile, batchSize, osInfo, osWarning, osVerbose); + + if (!reporting.exportTimes.empty()) + { + exportJSONTrace(trace, reporting.exportTimes); + } +} + +//! Printed format: +//! [ value, ...] +//! value ::= { "start enq" : time, "end enq" : time, "start h2d" : time, "end h2d" : time, "start compute" : time, +//! "end compute" : time, "start d2h" : time, "end d2h" : time, "h2d" : time, "compute" : time, +//! "d2h" : time, "latency" : time, "end to end" : time } +//!
+void exportJSONTrace(std::vector const& trace, std::string const& fileName) +{ + std::ofstream os(fileName, std::ofstream::trunc); + os << "[" << std::endl; + char const* sep = " "; + for (auto const& t : trace) + { + InferenceTime const it(traceToTiming(t)); + os << sep << "{ "; + sep = ", "; + // clang-format off + os << "\"startEnqMs\" : " << t.enqStart << sep << "\"endEnqMs\" : " << t.enqEnd << sep + << "\"startH2dMs\" : " << t.h2dStart << sep << "\"endH2dMs\" : " << t.h2dEnd << sep + << "\"startComputeMs\" : " << t.computeStart << sep << "\"endComputeMs\" : " << t.computeEnd << sep + << "\"startD2hMs\" : " << t.d2hStart << sep << "\"endD2hMs\" : " << t.d2hEnd << sep + << "\"h2dMs\" : " << it.h2d << sep << "\"computeMs\" : " << it.compute << sep + << "\"d2hMs\" : " << it.d2h << sep << "\"latencyMs\" : " << it.latency() << sep + << "\"endToEndMs\" : " << it.e2e << " }" << std::endl; + // clang-format on + } + os << "]" << std::endl; +} + +void Profiler::reportLayerTime(char const* layerName, float timeMs) noexcept +{ + if (mIterator == mLayers.end()) + { + bool const first = !mLayers.empty() && mLayers.begin()->name == layerName; + mUpdatesCount += mLayers.empty() || first; + if (first) + { + mIterator = mLayers.begin(); + } + else + { + mLayers.emplace_back(); + mLayers.back().name = layerName; + mIterator = mLayers.end() - 1; + } + } + + mIterator->timeMs += timeMs; + ++mIterator; +} + +void Profiler::print(std::ostream& os) const noexcept +{ + std::string const nameHdr("Layer"); + std::string const timeHdr(" Time (ms)"); + std::string const avgHdr(" Avg. Time (ms)"); + std::string const percentageHdr(" Time %"); + + float const totalTimeMs = getTotalTime(); + + auto const cmpLayer = [](LayerProfile const& a, LayerProfile const& b) { return a.name.size() < b.name.size(); }; + auto const longestName = std::max_element(mLayers.begin(), mLayers.end(), cmpLayer); + auto const nameLength = std::max(longestName->name.size() + 1, nameHdr.size()); + auto const timeLength = timeHdr.size(); + auto const avgLength = avgHdr.size(); + auto const percentageLength = percentageHdr.size(); + + os << std::endl + << "=== Profile (" << mUpdatesCount << " iterations ) ===" << std::endl + << std::setw(nameLength) << nameHdr << timeHdr << avgHdr << percentageHdr << std::endl; + + for (auto const& p : mLayers) + { + // clang-format off + os << std::setw(nameLength) << p.name << std::setw(timeLength) << std::fixed << std::setprecision(2) << p.timeMs + << std::setw(avgLength) << std::fixed << std::setprecision(4) << p.timeMs / mUpdatesCount + << std::setw(percentageLength) << std::fixed << std::setprecision(1) << p.timeMs / totalTimeMs * 100 + << std::endl; + } + { + os << std::setw(nameLength) << "Total" << std::setw(timeLength) << std::fixed << std::setprecision(2) + << totalTimeMs << std::setw(avgLength) << std::fixed << std::setprecision(4) << totalTimeMs / mUpdatesCount + << std::setw(percentageLength) << std::fixed << std::setprecision(1) << 100.0 << std::endl; + // clang-format on + } + os << std::endl; +} + +void Profiler::exportJSONProfile(std::string const& fileName) const noexcept +{ + std::ofstream os(fileName, std::ofstream::trunc); + os << "[" << std::endl << " { \"count\" : " << mUpdatesCount << " }" << std::endl; + + auto const totalTimeMs = getTotalTime(); + + for (auto const& l : mLayers) + { + // clang-format off + os << ", {" << " \"name\" : \"" << l.name << "\"" + ", \"timeMs\" : " << l.timeMs + << ", \"averageMs\" : " << l.timeMs / mUpdatesCount + << ", \"percentage\" : " << l.timeMs / 
totalTimeMs * 100 + << " }" << std::endl; + // clang-format on + } + os << "]" << std::endl; +} + +void dumpInputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os) +{ + os << "Input Tensors:" << std::endl; + bindings.dumpInputs(context, os); +} + +void dumpOutputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os) +{ + os << "Output Tensors:" << std::endl; + bindings.dumpOutputs(context, os); +} + +void exportJSONOutput( + nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::string const& fileName, int32_t batch) +{ + std::ofstream os(fileName, std::ofstream::trunc); + std::string sep = " "; + auto const output = bindings.getOutputBindings(); + os << "[" << std::endl; + for (auto const& binding : output) + { + // clang-format off + os << sep << "{ \"name\" : \"" << binding.first << "\"" << std::endl; + sep = ", "; + os << " " << sep << "\"dimensions\" : \""; + bindings.dumpBindingDimensions(binding.second, context, os); + os << "\"" << std::endl; + os << " " << sep << "\"values\" : [ "; + bindings.dumpBindingValues(context, binding.second, os, sep, batch); + os << " ]" << std::endl << " }" << std::endl; + // clang-format on + } + os << "]" << std::endl; +} + +} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.h new file mode 100644 index 00000000..5f730987 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.h @@ -0,0 +1,222 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_SAMPLE_REPORTING_H +#define TRT_SAMPLE_REPORTING_H + +#include +#include + +#include "NvInfer.h" + +#include "sampleOptions.h" +#include "sampleUtils.h" + +namespace sample +{ + +//! +//! \struct InferenceTime +//! \brief Measurement times in milliseconds +//! +struct InferenceTime +{ + InferenceTime(float q, float i, float c, float o, float e) + : enq(q) + , h2d(i) + , compute(c) + , d2h(o) + , e2e(e) + { + } + + InferenceTime() = default; + InferenceTime(InferenceTime const&) = default; + InferenceTime(InferenceTime&&) = default; + InferenceTime& operator=(InferenceTime const&) = default; + InferenceTime& operator=(InferenceTime&&) = default; + ~InferenceTime() = default; + + float enq{0}; // Enqueue + float h2d{0}; // Host to Device + float compute{0}; // Compute + float d2h{0}; // Device to Host + float e2e{0}; // end to end + + // ideal latency + float latency() const + { + return h2d + compute + d2h; + } +}; + +//! +//! \struct InferenceTrace +//! \brief Measurement points in milliseconds +//! 
+struct InferenceTrace +{ + InferenceTrace(int32_t s, float es, float ee, float is, float ie, float cs, float ce, float os, float oe) + : stream(s) + , enqStart(es) + , enqEnd(ee) + , h2dStart(is) + , h2dEnd(ie) + , computeStart(cs) + , computeEnd(ce) + , d2hStart(os) + , d2hEnd(oe) + { + } + + InferenceTrace() = default; + InferenceTrace(InferenceTrace const&) = default; + InferenceTrace(InferenceTrace&&) = default; + InferenceTrace& operator=(InferenceTrace const&) = default; + InferenceTrace& operator=(InferenceTrace&&) = default; + ~InferenceTrace() = default; + + int32_t stream{0}; + float enqStart{0}; + float enqEnd{0}; + float h2dStart{0}; + float h2dEnd{0}; + float computeStart{0}; + float computeEnd{0}; + float d2hStart{0}; + float d2hEnd{0}; +}; + +inline InferenceTime operator+(InferenceTime const& a, InferenceTime const& b) +{ + return InferenceTime(a.enq + b.enq, a.h2d + b.h2d, a.compute + b.compute, a.d2h + b.d2h, a.e2e + b.e2e); +} + +inline InferenceTime operator+=(InferenceTime& a, InferenceTime const& b) +{ + return a = a + b; +} + +//! +//! \struct PerformanceResult +//! \brief Performance result of a performance metric +//! +struct PerformanceResult +{ + float min{0}; + float max{0}; + float mean{0}; + float median{0}; + float percentile{0}; + float coeffVar{0}; // coefficient of variation +}; + +//! +//! \brief Print benchmarking time and number of traces collected +//! +void printProlog(int32_t warmups, int32_t timings, float warmupMs, float walltime, std::ostream& os); + +//! +//! \brief Print a timing trace +//! +void printTiming(std::vector const& timings, int32_t runsPerAvg, std::ostream& os); + +//! +//! \brief Print the performance summary of a trace +//! +void printEpilog(std::vector const& timings, float percentile, int32_t batchSize, std::ostream& osInfo, + std::ostream& osWarning, std::ostream& osVerbose); + +//! +//! \brief Get the result of a specific performance metric from a trace +//! +PerformanceResult getPerformanceResult(std::vector const& timings, + std::function metricGetter, float percentile); + +//! +//! \brief Print the explanations of the performance metrics printed in printEpilog() function. +//! +void printMetricExplanations(std::ostream& os); + +//! +//! \brief Print and summarize a timing trace +//! +void printPerformanceReport(std::vector const& trace, ReportingOptions const& reporting, float warmupMs, + int32_t batchSize, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose); + +//! +//! \brief Export a timing trace to JSON file +//! +void exportJSONTrace(std::vector const& trace, std::string const& fileName); + +//! +//! \brief Print input tensors to stream +//! +void dumpInputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os); + +//! +//! \brief Print output tensors to stream +//! +void dumpOutputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os); + +//! +//! \brief Export output tensors to JSON file +//! +void exportJSONOutput( + nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::string const& fileName, int32_t batch); + +//! +//! \struct LayerProfile +//! \brief Layer profile information +//! +struct LayerProfile +{ + std::string name; + float timeMs{0}; +}; + +//! +//! \class Profiler +//! \brief Collect per-layer profile information, assuming times are reported in the same order +//! 
+class Profiler : public nvinfer1::IProfiler +{ + +public: + void reportLayerTime(char const* layerName, float timeMs) noexcept override; + + void print(std::ostream& os) const noexcept; + + //! + //! \brief Export a profile to JSON file + //! + void exportJSONProfile(std::string const& fileName) const noexcept; + +private: + float getTotalTime() const noexcept + { + auto const plusLayerTime = [](float accumulator, LayerProfile const& lp) { return accumulator + lp.timeMs; }; + return std::accumulate(mLayers.begin(), mLayers.end(), 0.0, plusLayerTime); + } + + std::vector<LayerProfile> mLayers; + std::vector<LayerProfile>::iterator mIterator{mLayers.begin()}; + int32_t mUpdatesCount{0}; +}; + +} // namespace sample + +#endif // TRT_SAMPLE_REPORTING_H diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleUtils.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleUtils.h new file mode 100644 index 00000000..1509a7fc --- /dev/null +++ b/src/Detector/tensorrt_yolo/common_deprecated/sampleUtils.h @@ -0,0 +1,543 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_SAMPLE_UTILS_H +#define TRT_SAMPLE_UTILS_H + +#include <fstream> +#include <iostream> +#include <memory> +#include <numeric> +#include <random> +#include <string> +#include <unordered_map> +#include <vector> + +#include <cuda.h> +#include <cuda_fp16.h> + +#include "NvInfer.h" + +#include "common.h" +#include "logger.h" +#include "sampleDevice.h" +#include "sampleOptions.h" + +namespace sample +{ + +inline int dataTypeSize(nvinfer1::DataType dataType) +{ + switch (dataType) + { + case nvinfer1::DataType::kINT32: + case nvinfer1::DataType::kFLOAT: return 4; + case nvinfer1::DataType::kHALF: return 2; + case nvinfer1::DataType::kBOOL: + case nvinfer1::DataType::kINT8: return 1; + } + return 0; +} + +template <typename T> +inline T roundUp(T m, T n) +{ + return ((m + n - 1) / n) * n; +} + +inline int volume(const nvinfer1::Dims& d) +{ + return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies<int>()); +} + +//! comps is the number of components in a vector. Ignored if vecDim < 0. +inline int64_t volume(const nvinfer1::Dims& dims, const nvinfer1::Dims& strides, int vecDim, int comps, int batch) +{ + int maxNbElems = 1; + for (int i = 0; i < dims.nbDims; ++i) + { + // Get effective length of axis. + int d = dims.d[i]; + // Any dimension is 0, it is an empty tensor. + if (d == 0) + { + return 0; + } + if (i == vecDim) + { + d = samplesCommon::divUp(d, comps); + } + maxNbElems = std::max(maxNbElems, d * strides.d[i]); + } + return static_cast<int64_t>(maxNbElems) * batch * (vecDim < 0 ? 1 : comps); +} + +inline int64_t volume(nvinfer1::Dims dims, int vecDim, int comps, int batch) +{ + if (vecDim != -1) + { + dims.d[vecDim] = roundUp(dims.d[vecDim], comps); + } + return volume(dims) * std::max(batch, 1); +}
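A worked example of the vectorized overload above, with hypothetical values: for an NCHW tensor of 1x3x224x224 whose channel axis is vectorized in groups of four, the channel count is rounded up before the product is taken:

    // Hypothetical: channel axis (index 1) vectorized with 4 components per vector.
    nvinfer1::Dims dims{4, {1, 3, 224, 224}};
    int64_t const n = volume(dims, /*vecDim=*/1, /*comps=*/4, /*batch=*/1);
    // roundUp(3, 4) = 4, so n = 1 * 4 * 224 * 224 = 200704 elements, padding included.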
+inline nvinfer1::Dims toDims(const std::vector<int32_t>& vec) +{ + int limit = static_cast<int>(nvinfer1::Dims::MAX_DIMS); + if (static_cast<int>(vec.size()) > limit) + { + sample::gLogWarning << "Vector too long, only first 8 elements are used in dimension." << std::endl; + } + // Pick first nvinfer1::Dims::MAX_DIMS elements + nvinfer1::Dims dims{std::min(static_cast<int>(vec.size()), limit), {}}; + std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d)); + return dims; +} + +template <typename T> +inline void fillBuffer(void* buffer, int64_t volume, T min, T max) +{ + T* typedBuffer = static_cast<T*>(buffer); + std::default_random_engine engine; + if (std::is_integral<T>::value) + { + std::uniform_int_distribution<int32_t> distribution(min, max); + auto generator = [&engine, &distribution]() { return static_cast<T>(distribution(engine)); }; + std::generate(typedBuffer, typedBuffer + volume, generator); + } + else + { + std::uniform_real_distribution<float> distribution(min, max); + auto generator = [&engine, &distribution]() { return static_cast<T>(distribution(engine)); }; + std::generate(typedBuffer, typedBuffer + volume, generator); + } +} + +// Specialization needed for custom type __half +template <typename H> +inline void fillBufferHalf(void* buffer, int64_t volume, H min, H max) +{ + H* typedBuffer = static_cast<H*>(buffer); + std::default_random_engine engine; + std::uniform_real_distribution<float> distribution(min, max); + auto generator = [&engine, &distribution]() { return static_cast<H>(distribution(engine)); }; + std::generate(typedBuffer, typedBuffer + volume, generator); +} +template <> +inline void fillBuffer<__half>(void* buffer, int64_t volume, __half min, __half max) +{ + fillBufferHalf(buffer, volume, min, max); +}
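A short usage sketch for the fill helpers above (hypothetical host buffers; in this file they are normally invoked through Binding::fill further below):

    // Hypothetical: populate host-side buffers with uniform random test data.
    std::vector<float> activations(8);
    fillBuffer<float>(activations.data(), activations.size(), -1.0F, 1.0F);
    std::vector<int8_t> quantized(8);
    fillBuffer<int8_t>(quantized.data(), quantized.size(), static_cast<int8_t>(-128), static_cast<int8_t>(127));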
+template <typename T> +inline void dumpBuffer(const void* buffer, const std::string& separator, std::ostream& os, const nvinfer1::Dims& dims, + const nvinfer1::Dims& strides, int32_t vectorDim, int32_t spv) +{ + const int64_t volume = std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies<int64_t>()); + const T* typedBuffer = static_cast<const T*>(buffer); + std::string sep; + for (int64_t v = 0; v < volume; ++v) + { + int64_t curV = v; + int32_t dataOffset = 0; + for (int32_t dimIndex = dims.nbDims - 1; dimIndex >= 0; --dimIndex) + { + int32_t dimVal = curV % dims.d[dimIndex]; + if (dimIndex == vectorDim) + { + dataOffset += (dimVal / spv) * strides.d[dimIndex] * spv + dimVal % spv; + } + else + { + dataOffset += dimVal * strides.d[dimIndex] * (vectorDim == -1 ? 1 : spv); + } + curV /= dims.d[dimIndex]; + ASSERT(curV >= 0); + } + + os << sep << typedBuffer[dataOffset]; + sep = separator; + } +} + +inline void loadFromFile(std::string const& fileName, char* dst, size_t size) +{ + ASSERT(dst); + + std::ifstream file(fileName, std::ios::in | std::ios::binary); + if (file.is_open()) + { + file.read(dst, size); + file.close(); + } + else + { + std::stringstream msg; + msg << "Cannot open file " << fileName << "!"; + throw std::invalid_argument(msg.str()); + } +} + +struct Binding +{ + bool isInput{false}; + std::unique_ptr<IMirroredBuffer> buffer; + int64_t volume{0}; + nvinfer1::DataType dataType{nvinfer1::DataType::kFLOAT}; + + void fill(const std::string& fileName) + { + loadFromFile(fileName, static_cast<char*>(buffer->getHostBuffer()), buffer->getSize()); + } + + void fill() + { + switch (dataType) + { + case nvinfer1::DataType::kBOOL: + { + fillBuffer<bool>(buffer->getHostBuffer(), volume, 0, 1); + break; + } + case nvinfer1::DataType::kINT32: + { + fillBuffer<int32_t>(buffer->getHostBuffer(), volume, -128, 127); + break; + } + case nvinfer1::DataType::kINT8: + { + fillBuffer<int8_t>(buffer->getHostBuffer(), volume, -128, 127); + break; + } + case nvinfer1::DataType::kFLOAT: + { + fillBuffer<float>(buffer->getHostBuffer(), volume, -1.0F, 1.0F); + break; + } + case nvinfer1::DataType::kHALF: + { + fillBuffer<__half>(buffer->getHostBuffer(), volume, -1.0F, 1.0F); + break; + } + } + } + + void dump(std::ostream& os, nvinfer1::Dims dims, nvinfer1::Dims strides, int32_t vectorDim, int32_t spv, + const std::string separator = " ") const + { + switch (dataType) + { + case nvinfer1::DataType::kBOOL: + { + dumpBuffer<bool>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kINT32: + { + dumpBuffer<int32_t>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kINT8: + { + dumpBuffer<int8_t>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kFLOAT: + { + dumpBuffer<float>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kHALF: + { + dumpBuffer<__half>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); + break; + } + } + } +}; + +class Bindings +{ +public: + Bindings() = delete; + explicit Bindings(bool useManaged) + : mUseManaged(useManaged) + { + } + + void addBinding(int b, const std::string& name, bool isInput, int64_t volume, nvinfer1::DataType dataType, + const std::string& fileName = "") + { + while (mBindings.size() <= static_cast<size_t>(b)) + { + mBindings.emplace_back(); + mDevicePointers.emplace_back(); + } + mNames[name] = b; + if (mBindings[b].buffer == nullptr) + { + if (mUseManaged) + mBindings[b].buffer.reset(new UnifiedMirroredBuffer); + else + mBindings[b].buffer.reset(new DiscreteMirroredBuffer); + } + mBindings[b].isInput = isInput; + // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr + // even for empty tensors, so allocate a dummy byte.
+ if (volume == 0) + mBindings[b].buffer->allocate(1); + else + mBindings[b].buffer->allocate(static_cast(volume) * static_cast(dataTypeSize(dataType))); + + mBindings[b].volume = volume; + mBindings[b].dataType = dataType; + mDevicePointers[b] = mBindings[b].buffer->getDeviceBuffer(); + if (isInput) + { + if (fileName.empty()) + fill(b); + else + fill(b, fileName); + } + } + + void** getDeviceBuffers() + { + return mDevicePointers.data(); + } + + void transferInputToDevice(TrtCudaStream& stream) + { + for (auto& b : mNames) + { + if (mBindings[b.second].isInput) + mBindings[b.second].buffer->hostToDevice(stream); + } + } + + void transferOutputToHost(TrtCudaStream& stream) + { + for (auto& b : mNames) + { + if (!mBindings[b.second].isInput) + mBindings[b.second].buffer->deviceToHost(stream); + } + } + + void fill(int binding, const std::string& fileName) + { + mBindings[binding].fill(fileName); + } + + void fill(int binding) + { + mBindings[binding].fill(); + } + + void dumpBindingDimensions(int binding, const nvinfer1::IExecutionContext& context, std::ostream& os) const + { + const auto dims = context.getBindingDimensions(binding); + // Do not add a newline terminator, because the caller may be outputting a JSON string. + os << dims; + } + + void dumpBindingValues(const nvinfer1::IExecutionContext& context, int binding, std::ostream& os, + const std::string& separator = " ", int32_t batch = 1) const + { + nvinfer1::Dims dims = context.getBindingDimensions(binding); + nvinfer1::Dims strides = context.getStrides(binding); + int32_t vectorDim = context.getEngine().getBindingVectorizedDim(binding); + const int32_t spv = context.getEngine().getBindingComponentsPerElement(binding); + + if (context.getEngine().hasImplicitBatchDimension()) + { + auto insertN = [](nvinfer1::Dims& d, int32_t bs) { + const int32_t nbDims = d.nbDims; + ASSERT(nbDims < nvinfer1::Dims::MAX_DIMS); + std::copy_backward(&d.d[0], &d.d[nbDims], &d.d[nbDims + 1]); + d.d[0] = bs; + d.nbDims = nbDims + 1; + }; + int32_t batchStride = 0; + for (int32_t i = 0; i < strides.nbDims; ++i) + { + if (strides.d[i] * dims.d[i] > batchStride) + { + batchStride = strides.d[i] * dims.d[i]; + } + } + insertN(dims, batch); + insertN(strides, batchStride); + vectorDim = (vectorDim == -1) ? 
-1 : vectorDim + 1; + } + + mBindings[binding].dump(os, dims, strides, vectorDim, spv, separator); + } + + void dumpInputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const + { + auto isInput = [](const Binding& b) { return b.isInput; }; + dumpBindings(context, isInput, os); + } + + void dumpOutputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const + { + auto isOutput = [](const Binding& b) { return !b.isInput; }; + dumpBindings(context, isOutput, os); + } + + void dumpBindings(const nvinfer1::IExecutionContext& context, std::ostream& os) const + { + auto all = [](const Binding& /*b*/) { return true; }; + dumpBindings(context, all, os); + } + + void dumpBindings( + const nvinfer1::IExecutionContext& context, bool (*predicate)(const Binding& b), std::ostream& os) const + { + for (const auto& n : mNames) + { + const auto binding = n.second; + if (predicate(mBindings[binding])) + { + os << n.first << ": ("; + dumpBindingDimensions(binding, context, os); + os << ")" << std::endl; + + dumpBindingValues(context, binding, os); + os << std::endl; + } + } + } + + std::unordered_map<std::string, int> getInputBindings() const + { + auto isInput = [](const Binding& b) { return b.isInput; }; + return getBindings(isInput); + } + + std::unordered_map<std::string, int> getOutputBindings() const + { + auto isOutput = [](const Binding& b) { return !b.isInput; }; + return getBindings(isOutput); + } + + std::unordered_map<std::string, int> getBindings() const + { + auto all = [](const Binding& /*b*/) { return true; }; + return getBindings(all); + } + + std::unordered_map<std::string, int> getBindings(bool (*predicate)(const Binding& b)) const + { + std::unordered_map<std::string, int> bindings; + for (const auto& n : mNames) + { + const auto binding = n.second; + if (predicate(mBindings[binding])) + bindings.insert(n); + } + return bindings; + } + +private: + std::unordered_map<std::string, int> mNames; + std::vector<Binding> mBindings; + std::vector<void*> mDevicePointers; + bool mUseManaged{false}; +}; + +template <typename T> +struct TrtDestroyer +{ + void operator()(T* t) + { + //t->destroy(); + delete t; + } +}; + +template <typename T> +using TrtUniquePtr = std::unique_ptr<T, TrtDestroyer<T>>; + +inline bool broadcastIOFormats(const std::vector<IOFormat>& formats, size_t nbBindings, bool isInput = true) +{ + bool broadcast = formats.size() == 1; + bool validFormatsCount = broadcast || (formats.size() == nbBindings); + if (!formats.empty() && !validFormatsCount) + { + if (isInput) + { + throw std::invalid_argument( + "The number of inputIOFormats must match network's inputs or be one for broadcasting."); + } + else + { + throw std::invalid_argument( + "The number of outputIOFormats must match network's outputs or be one for broadcasting."); + } + } + return broadcast; +}
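TrtDestroyer above plainly deletes the object (the older destroy() call is left commented out because TensorRT 8+ deprecates it in favor of delete), so TrtUniquePtr gives RAII ownership of any TensorRT interface. A brief sketch, assuming the sample's gLogger from logger.h:

    // Illustrative: automatic cleanup of a TensorRT runtime object.
    TrtUniquePtr<nvinfer1::IRuntime> runtime{nvinfer1::createInferRuntime(sample::gLogger.getTRTLogger())};
    if (runtime)
    {
        // use runtime ... it is destroyed automatically at end of scope
    }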
+inline std::vector<char> loadTimingCacheFile(const std::string inFileName) +{ + std::ifstream iFile(inFileName, std::ios::in | std::ios::binary); + if (!iFile) + { + sample::gLogWarning << "Could not read timing cache from: " << inFileName + << ". A new timing cache will be generated and written." << std::endl; + return std::vector<char>(); + } + iFile.seekg(0, std::ifstream::end); + size_t fsize = iFile.tellg(); + iFile.seekg(0, std::ifstream::beg); + std::vector<char> content(fsize); + iFile.read(content.data(), fsize); + iFile.close(); + sample::gLogInfo << "Loaded " << fsize << " bytes of timing cache from " << inFileName << std::endl; + return content; +} + +inline void saveTimingCacheFile(const std::string outFileName, const nvinfer1::IHostMemory* blob) +{ + std::ofstream oFile(outFileName, std::ios::out | std::ios::binary); + if (!oFile) + { + sample::gLogWarning << "Could not write timing cache to: " << outFileName << std::endl; + return; + } + oFile.write((char*) blob->data(), blob->size()); + oFile.close(); + sample::gLogInfo << "Saved " << blob->size() << " bytes of timing cache to " << outFileName << std::endl; +} + +inline int32_t getCudaDriverVersion() +{ + int32_t version{-1}; + cudaCheck(cudaDriverGetVersion(&version)); + return version; +} + +inline int32_t getCudaRuntimeVersion() +{ + int32_t version{-1}; + cudaCheck(cudaRuntimeGetVersion(&version)); + return version; +} + +} // namespace sample + +#endif // TRT_SAMPLE_UTILS_H diff --git a/src/Detector/tensorrt_yolo/yolo.cpp b/src/Detector/tensorrt_yolo/yolo.cpp index a60d3dc4..4ee202b6 100644 --- a/src/Detector/tensorrt_yolo/yolo.cpp +++ b/src/Detector/tensorrt_yolo/yolo.cpp @@ -78,7 +78,31 @@ Yolo::Yolo(const NetworkInfo& networkInfo, const InferParams& inferParams) assert(m_Engine != nullptr); m_Context = m_Engine->createExecutionContext(); assert(m_Context != nullptr); + + auto numBindings = m_Engine->getNbIOTensors(); + //std::cout << "** Bindings: " << numBindings << " **" << std::endl; + for (int32_t i = 0; i < numBindings; ++i) + { + std::string bindName = m_Engine->getIOTensorName(i); + m_tensorNames.emplace(bindName, i); + nvinfer1::Dims dim = m_Engine->getTensorShape(bindName.c_str()); + + std::cout << i << ": name: " << bindName; + std::cout << ", size: "; + for (int j = 0; j < dim.nbDims; ++j) + { + std::cout << dim.d[j]; + if (j < dim.nbDims - 1) + std::cout << "x"; + } + std::cout << std::endl; + + if (m_InputBlobName == bindName) + m_InputBindingIndex = i; + } +#if (NV_TENSORRT_MAJOR < 9) m_InputBindingIndex = m_Engine->getBindingIndex(m_InputBlobName.c_str()); +#endif assert(m_InputBindingIndex != -1); assert(m_BatchSize <= static_cast<uint32_t>(m_Engine->getMaxBatchSize())); allocateBuffers(); @@ -464,7 +488,14 @@ void Yolo::createYOLOEngine(const nvinfer1::DataType dataType, Int8EntropyCalibr // Build the engine std::cout << "Building the TensorRT Engine..." << std::endl; - m_Engine = m_Builder->buildEngineWithConfig(*m_Network, *config); +#if (NV_TENSORRT_MAJOR < 9) + m_Engine = m_Builder->buildEngineWithConfig(*m_Network, *config); +#else + nvinfer1::IRuntime* inferRuntime = nvinfer1::createInferRuntime(m_Logger); + nvinfer1::IHostMemory* serialNetwork = m_Builder->buildSerializedNetwork(*m_Network, *config); + m_Engine = inferRuntime->deserializeCudaEngine(serialNetwork->data(), serialNetwork->size()); + delete inferRuntime; +#endif assert(m_Engine != nullptr); std::cout << "Building complete!" << std::endl;
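The #else branch above is the TensorRT 9+/10 path: buildEngineWithConfig() is gone, so the network is first serialized and then deserialized through a runtime. A hedged standalone sketch of the same flow (variable names illustrative; the serialized blob may be freed once deserialization has completed, since the engine keeps its own copy of the weights):

    // Sketch of the TRT >= 9 build path used in this hunk.
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
    nvinfer1::IHostMemory* serialized = builder->buildSerializedNetwork(*network, *config);
    nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(serialized->data(), serialized->size());
    delete serialized; // safe: the engine does not reference the blob afterwards
    delete runtime;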
<< std::endl; +#if (NV_TENSORRT_MAJOR < 9) m_Engine = m_Builder->buildEngineWithConfig(*m_Network, *config); +#else + nvinfer1::IRuntime* inferRuntime = nvinfer1::createInferRuntime(m_Logger); + nvinfer1::IHostMemory* serialNetmork = m_Builder->buildSerializedNetwork(*m_Network, *config); + m_Engine = inferRuntime->deserializeCudaEngine(serialNetmork->data(), serialNetmork->size()); + delete inferRuntime; +#endif + assert(m_Engine != nullptr); std::cout << "Building complete!" << std::endl; @@ -987,7 +1026,8 @@ void Yolo::doInference(const unsigned char* input, const uint32_t batchSize) batchSize * m_InputSize * sizeof(float), cudaMemcpyHostToDevice, m_CudaStream)); - m_Context->enqueue(batchSize, m_DeviceBuffers.data(), m_CudaStream, nullptr); + //m_Context->enqueueV3(batchSize, m_DeviceBuffers.data(), m_CudaStream, nullptr); + m_Context->enqueueV3(m_CudaStream); for (auto& tensor : m_OutputTensors) { NV_CUDA_CHECK(cudaMemcpyAsync(tensor.hostBuffer, m_DeviceBuffers.at(tensor.bindingIndex), @@ -1249,8 +1289,7 @@ void Yolo::parse_cfg_blocks_v5(const std::vectorgetNbBindings(), nullptr); + m_DeviceBuffers.resize(m_Engine->getNbIOTensors(), nullptr); assert(m_InputBindingIndex != -1 && "Invalid input binding index"); - NV_CUDA_CHECK(cudaMalloc(&m_DeviceBuffers.at(m_InputBindingIndex), - m_BatchSize * m_InputSize * sizeof(float))); + NV_CUDA_CHECK(cudaMalloc(&m_DeviceBuffers.at(m_InputBindingIndex), m_BatchSize * m_InputSize * sizeof(float))); for (auto& tensor : m_OutputTensors) { +#if (NV_TENSORRT_MAJOR < 9) tensor.bindingIndex = m_Engine->getBindingIndex(tensor.blobName.c_str()); +#else + auto it = m_tensorNames.find(tensor.blobName); + tensor.bindingIndex = (it != std::end(m_tensorNames)) ? it->second : -1; +#endif assert((tensor.bindingIndex != -1) && "Invalid output binding index"); - NV_CUDA_CHECK(cudaMalloc(&m_DeviceBuffers.at(tensor.bindingIndex), - m_BatchSize * tensor.volume * sizeof(float))); - NV_CUDA_CHECK( - cudaMallocHost(&tensor.hostBuffer, tensor.volume * m_BatchSize * sizeof(float))); + NV_CUDA_CHECK(cudaMalloc(&m_DeviceBuffers.at(tensor.bindingIndex), m_BatchSize * tensor.volume * sizeof(float))); + NV_CUDA_CHECK(cudaMallocHost((void**)&tensor.hostBuffer, tensor.volume * m_BatchSize * sizeof(float))); } } diff --git a/src/Detector/tensorrt_yolo/yolo.h b/src/Detector/tensorrt_yolo/yolo.h index be347d19..4cfdba16 100644 --- a/src/Detector/tensorrt_yolo/yolo.h +++ b/src/Detector/tensorrt_yolo/yolo.h @@ -158,6 +158,7 @@ class Yolo std::vector m_DeviceBuffers; int m_InputBindingIndex = -1; cudaStream_t m_CudaStream = nullptr; + std::map m_tensorNames; virtual std::vector decodeTensor(const int imageIdx, const int imageH, const int imageW, const TensorInfo& tensor) = 0; From b3bcfb36de6307f97027082e402bc53a37961055 Mon Sep 17 00:00:00 2001 From: Nuzhny007 Date: Thu, 3 Oct 2024 00:05:29 +0300 Subject: [PATCH 2/3] TensorRT 10 is supported, YOLOv11, YOLOv11-obb and YOLOv11-seg detector worked with TensorRT --- README.md | 4 + data/settings_yolov11.ini | 142 + data/settings_yolov11_obb.ini | 142 + data/settings_yolov11_seg.ini | 142 + example/examples.h | 5 +- src/Detector/OCVDNNDetector.cpp | 7 +- src/Detector/OCVDNNDetector.h | 5 +- src/Detector/YoloTensorRTDetector.cpp | 5 +- src/Detector/tensorrt_yolo/CMakeLists.txt | 13 +- src/Detector/tensorrt_yolo/YoloONNX.cpp | 6 +- src/Detector/tensorrt_yolo/YoloONNXv11_bb.hpp | 111 + .../tensorrt_yolo/YoloONNXv11_instance.hpp | 301 ++ .../tensorrt_yolo/YoloONNXv11_obb.hpp | 124 + src/Detector/tensorrt_yolo/class_detector.cpp | 23 +- 
src/Detector/tensorrt_yolo/class_detector.h | 5 +- .../tensorrt_yolo/cmake/FindTensorRT.cmake | 143 +- .../{sampleEngines.cpp_ => sampleEngines.cpp} | 6 +- ...mpleInference.cpp_ => sampleInference.cpp} | 0 .../common_deprecated/BatchStream.h | 388 -- .../common_deprecated/EntropyCalibrator.h | 134 - .../common_deprecated/ErrorRecorder.h | 137 - .../tensorrt_yolo/common_deprecated/buffers.h | 478 -- .../tensorrt_yolo/common_deprecated/common.h | 963 ---- .../tensorrt_yolo/common_deprecated/half.h | 4302 ----------------- .../common_deprecated/logger.cpp | 40 - .../tensorrt_yolo/common_deprecated/logger.h | 36 - .../tensorrt_yolo/common_deprecated/logging.h | 578 --- .../common_deprecated/parserOnnxConfig.h | 153 - .../common_deprecated/safeCommon.h | 71 - .../common_deprecated/sampleConfig.h | 337 -- .../common_deprecated/sampleDevice.h | 494 -- .../common_deprecated/sampleEngines.cpp | 1629 ------- .../common_deprecated/sampleEngines.h | 183 - .../common_deprecated/sampleInference.cpp | 990 ---- .../common_deprecated/sampleInference.h | 92 - .../common_deprecated/sampleOptions.cpp | 1778 ------- .../common_deprecated/sampleOptions.h | 355 -- .../common_deprecated/sampleReporting.cpp | 445 -- .../common_deprecated/sampleReporting.h | 222 - .../common_deprecated/sampleUtils.h | 543 --- src/Detector/tensorrt_yolo/ds_image.cpp | 6 +- 41 files changed, 1121 insertions(+), 14417 deletions(-) create mode 100644 data/settings_yolov11.ini create mode 100644 data/settings_yolov11_obb.ini create mode 100644 data/settings_yolov11_seg.ini create mode 100644 src/Detector/tensorrt_yolo/YoloONNXv11_bb.hpp create mode 100644 src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp create mode 100644 src/Detector/tensorrt_yolo/YoloONNXv11_obb.hpp rename src/Detector/tensorrt_yolo/common/{sampleEngines.cpp_ => sampleEngines.cpp} (99%) rename src/Detector/tensorrt_yolo/common/{sampleInference.cpp_ => sampleInference.cpp} (100%) delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/BatchStream.h delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/EntropyCalibrator.h delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/ErrorRecorder.h delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/buffers.h delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/common.h delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/half.h delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/logger.cpp delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/logger.h delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/logging.h delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/parserOnnxConfig.h delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/safeCommon.h delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleConfig.h delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleDevice.h delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.cpp delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.h delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleInference.cpp delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleInference.h delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.cpp delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.h delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.cpp delete mode 100644 
src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.h delete mode 100644 src/Detector/tensorrt_yolo/common_deprecated/sampleUtils.h diff --git a/README.md b/README.md index 27b5fba2..a66543ca 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,10 @@ # Last changes +* TensorRT 10 is supported + +* YOLOv11, YOLOv11-obb and YOLOv11-seg detector worked with TensorRT! Export pretrained Pytorch models [here (ultralytics/ultralytics)](https://github.com/ultralytics/ultralytics) to onnx format and run Multitarget-tracker with -e=6 example + * YOLOv8-obb detector worked with TensorRT! Export pretrained Pytorch models [here (ultralytics/ultralytics)](https://github.com/ultralytics/ultralytics) to onnx format and run Multitarget-tracker with -e=6 example * YOLOv10 detector worked with TensorRT! Export pretrained Pytorch models [here (THU-MIG/yolov10)](https://github.com/THU-MIG/yolov10) to onnx format and run Multitarget-tracker with -e=6 example diff --git a/data/settings_yolov11.ini b/data/settings_yolov11.ini new file mode 100644 index 00000000..c82412cd --- /dev/null +++ b/data/settings_yolov11.ini @@ -0,0 +1,142 @@ +[detection] + +#----------------------------- +# opencv_dnn = 12 +# darknet_cudnn = 10 +# tensorrt = 11 +detector_backend = 12 + +#----------------------------- +# Target and backend for opencv_dnn detector +# DNN_TARGET_CPU +# DNN_TARGET_OPENCL +# DNN_TARGET_OPENCL_FP16 +# DNN_TARGET_MYRIAD +# DNN_TARGET_CUDA +# DNN_TARGET_CUDA_FP16 +ocv_dnn_target = DNN_TARGET_CPU + +# DNN_BACKEND_DEFAULT +# DNN_BACKEND_HALIDE +# DNN_BACKEND_INFERENCE_ENGINE +# DNN_BACKEND_OPENCV +# DNN_BACKEND_VKCOM +# DNN_BACKEND_CUDA +# DNN_BACKEND_INFERENCE_ENGINE_NGRAPH +# DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 +ocv_dnn_backend = DNN_BACKEND_OPENCV + +#----------------------------- +nn_weights = C:/work/home/mtracker/Multitarget-tracker/data/coco/yolo11s.onnx +nn_config = C:/work/home/mtracker/Multitarget-tracker/data/coco/yolo11s.onnx +class_names = C:/work/home/mtracker/Multitarget-tracker/data/coco/coco.names + +#----------------------------- +confidence_threshold = 0.3 + +max_crop_ratio = 0 +max_batch = 1 +gpu_id = 0 + +#----------------------------- +# YOLOV3 +# YOLOV4 +# YOLOV5 +net_type = YOLOV11 + +#----------------------------- +# INT8 +# FP16 +# FP32 +inference_precision = FP16 + + +[tracking] + +#----------------------------- +# DistCenters = 0 // Euclidean distance between centers, pixels +# DistRects = 1 // Euclidean distance between bounding rectangles, pixels +# DistJaccard = 2 // Intersection over Union, IoU, [0, 1] +# DistHist = 3 // Bhatacharia distance between histograms, [0, 1] + +distance_type = 0 + +#----------------------------- +# KalmanLinear = 0 +# KalmanUnscented = 1 + +kalman_type = 0 + +#----------------------------- +# FilterCenter = 0 +# FilterRect = 1 +# FilterRRect = 2 + +filter_goal = 0 + +#----------------------------- +# TrackNone = 0 +# TrackKCF = 1 +# TrackMIL = 2 +# TrackMedianFlow = 3 +# TrackGOTURN = 4 +# TrackMOSSE = 5 +# TrackCSRT = 6 +# TrackDAT = 7 +# TrackSTAPLE = 8 +# TrackLDES = 9 +# TrackDaSiamRPN = 10 +# Used if filter_goal == FilterRect + +lost_track_type = 0 + +#----------------------------- +# MatchHungrian = 0 +# MatchBipart = 1 + +match_type = 0 + +#----------------------------- +# Use constant acceleration motion model: +# 0 - unused (stable) +# 1 - use acceleration in Kalman filter (experimental) +use_aceleration = 0 + +#----------------------------- +# Delta time for Kalman filter +delta_time = 0.4 + +#----------------------------- +# Accel 
noise magnitude for Kalman filter +accel_noise = 0.2 + +#----------------------------- +# Distance threshold between region and object on two frames +dist_thresh = 0.8 + +#----------------------------- +# If this value > 0 than will be used circle with this radius +# If this value <= 0 than will be used ellipse with size (3*vx, 3*vy), vx and vy - horizontal and vertical speed in pixelsa +min_area_radius_pix = -1 + +#----------------------------- +# Minimal area radius in ration for object size. Used if min_area_radius_pix < 0 +min_area_radius_k = 0.8 + +#----------------------------- +# If the object do not assignment more than this frames then it will be removed +max_skip_frames = 50 + +#----------------------------- +# The maximum trajectory length +max_trace_len = 50 + +#----------------------------- +# Detection abandoned objects +detect_abandoned = 0 +# After this time (in seconds) the object is considered abandoned +min_static_time = 5 +# After this time (in seconds) the abandoned object will be removed +max_static_time = 25 +# Speed in pixels. If speed of object is more that this value than object is non static +max_speed_for_static = 10 diff --git a/data/settings_yolov11_obb.ini b/data/settings_yolov11_obb.ini new file mode 100644 index 00000000..599e5dd5 --- /dev/null +++ b/data/settings_yolov11_obb.ini @@ -0,0 +1,142 @@ +[detection] + +#----------------------------- +# opencv_dnn = 12 +# darknet_cudnn = 10 +# tensorrt = 11 +detector_backend = 12 + +#----------------------------- +# Target and backend for opencv_dnn detector +# DNN_TARGET_CPU +# DNN_TARGET_OPENCL +# DNN_TARGET_OPENCL_FP16 +# DNN_TARGET_MYRIAD +# DNN_TARGET_CUDA +# DNN_TARGET_CUDA_FP16 +ocv_dnn_target = DNN_TARGET_CPU + +# DNN_BACKEND_DEFAULT +# DNN_BACKEND_HALIDE +# DNN_BACKEND_INFERENCE_ENGINE +# DNN_BACKEND_OPENCV +# DNN_BACKEND_VKCOM +# DNN_BACKEND_CUDA +# DNN_BACKEND_INFERENCE_ENGINE_NGRAPH +# DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 +ocv_dnn_backend = DNN_BACKEND_OPENCV + +#----------------------------- +nn_weights = C:/work/home/mtracker/Multitarget-tracker/data/coco/yolo11s-obb.onnx +nn_config = C:/work/home/mtracker/Multitarget-tracker/data/coco/yolo11s-obb.onnx +class_names = C:/work/home/mtracker/Multitarget-tracker/data/DOTA.names + +#----------------------------- +confidence_threshold = 0.3 + +max_crop_ratio = 0 +max_batch = 1 +gpu_id = 0 + +#----------------------------- +# YOLOV3 +# YOLOV4 +# YOLOV5 +net_type = YOLOV11_OBB + +#----------------------------- +# INT8 +# FP16 +# FP32 +inference_precision = FP16 + + +[tracking] + +#----------------------------- +# DistCenters = 0 // Euclidean distance between centers, pixels +# DistRects = 1 // Euclidean distance between bounding rectangles, pixels +# DistJaccard = 2 // Intersection over Union, IoU, [0, 1] +# DistHist = 3 // Bhatacharia distance between histograms, [0, 1] + +distance_type = 0 + +#----------------------------- +# KalmanLinear = 0 +# KalmanUnscented = 1 + +kalman_type = 0 + +#----------------------------- +# FilterCenter = 0 +# FilterRect = 1 +# FilterRRect = 2 + +filter_goal = 0 + +#----------------------------- +# TrackNone = 0 +# TrackKCF = 1 +# TrackMIL = 2 +# TrackMedianFlow = 3 +# TrackGOTURN = 4 +# TrackMOSSE = 5 +# TrackCSRT = 6 +# TrackDAT = 7 +# TrackSTAPLE = 8 +# TrackLDES = 9 +# TrackDaSiamRPN = 10 +# Used if filter_goal == FilterRect + +lost_track_type = 0 + +#----------------------------- +# MatchHungrian = 0 +# MatchBipart = 1 + +match_type = 0 + +#----------------------------- +# Use constant acceleration motion model: 
+# 0 - unused (stable) +# 1 - use acceleration in Kalman filter (experimental) +use_aceleration = 0 + +#----------------------------- +# Delta time for Kalman filter +delta_time = 0.4 + +#----------------------------- +# Accel noise magnitude for Kalman filter +accel_noise = 0.2 + +#----------------------------- +# Distance threshold between region and object on two frames +dist_thresh = 0.8 + +#----------------------------- +# If this value > 0 than will be used circle with this radius +# If this value <= 0 than will be used ellipse with size (3*vx, 3*vy), vx and vy - horizontal and vertical speed in pixelsa +min_area_radius_pix = -1 + +#----------------------------- +# Minimal area radius in ration for object size. Used if min_area_radius_pix < 0 +min_area_radius_k = 0.8 + +#----------------------------- +# If the object do not assignment more than this frames then it will be removed +max_skip_frames = 50 + +#----------------------------- +# The maximum trajectory length +max_trace_len = 50 + +#----------------------------- +# Detection abandoned objects +detect_abandoned = 0 +# After this time (in seconds) the object is considered abandoned +min_static_time = 5 +# After this time (in seconds) the abandoned object will be removed +max_static_time = 25 +# Speed in pixels. If speed of object is more that this value than object is non static +max_speed_for_static = 10 diff --git a/data/settings_yolov11_seg.ini b/data/settings_yolov11_seg.ini new file mode 100644 index 00000000..cb5c83ea --- /dev/null +++ b/data/settings_yolov11_seg.ini @@ -0,0 +1,142 @@ +[detection] + +#----------------------------- +# opencv_dnn = 12 +# darknet_cudnn = 10 +# tensorrt = 11 +detector_backend = 12 + +#----------------------------- +# Target and backend for opencv_dnn detector +# DNN_TARGET_CPU +# DNN_TARGET_OPENCL +# DNN_TARGET_OPENCL_FP16 +# DNN_TARGET_MYRIAD +# DNN_TARGET_CUDA +# DNN_TARGET_CUDA_FP16 +ocv_dnn_target = DNN_TARGET_CPU + +# DNN_BACKEND_DEFAULT +# DNN_BACKEND_HALIDE +# DNN_BACKEND_INFERENCE_ENGINE +# DNN_BACKEND_OPENCV +# DNN_BACKEND_VKCOM +# DNN_BACKEND_CUDA +# DNN_BACKEND_INFERENCE_ENGINE_NGRAPH +# DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 +ocv_dnn_backend = DNN_BACKEND_OPENCV + +#----------------------------- +nn_weights = C:/work/home/mtracker/Multitarget-tracker/data/coco/yolo11s-seg.onnx +nn_config = C:/work/home/mtracker/Multitarget-tracker/data/coco/yolo11s-seg.onnx +class_names = C:/work/home/mtracker/Multitarget-tracker/data/coco/coco.names + +#----------------------------- +confidence_threshold = 0.3 + +max_crop_ratio = 0 +max_batch = 1 +gpu_id = 0 + +#----------------------------- +# YOLOV3 +# YOLOV4 +# YOLOV5 +net_type = YOLOV11Mask + +#----------------------------- +# INT8 +# FP16 +# FP32 +inference_precision = FP16 + + +[tracking] + +#----------------------------- +# DistCenters = 0 // Euclidean distance between centers, pixels +# DistRects = 1 // Euclidean distance between bounding rectangles, pixels +# DistJaccard = 2 // Intersection over Union, IoU, [0, 1] +# DistHist = 3 // Bhatacharia distance between histograms, [0, 1] + +distance_type = 0 + +#----------------------------- +# KalmanLinear = 0 +# KalmanUnscented = 1 + +kalman_type = 0 + +#----------------------------- +# FilterCenter = 0 +# FilterRect = 1 +# FilterRRect = 2 + +filter_goal = 0 + +#----------------------------- +# TrackNone = 0 +# TrackKCF = 1 +# TrackMIL = 2 +# TrackMedianFlow = 3 +# TrackGOTURN = 4 +# TrackMOSSE = 5 +# TrackCSRT = 6 +# TrackDAT = 7 +# TrackSTAPLE = 8 +# TrackLDES = 9 +# TrackDaSiamRPN = 
10 +# Used if filter_goal == FilterRect + +lost_track_type = 0 + +#----------------------------- +# MatchHungrian = 0 +# MatchBipart = 1 + +match_type = 0 + +#----------------------------- +# Use constant acceleration motion model: +# 0 - unused (stable) +# 1 - use acceleration in Kalman filter (experimental) +use_aceleration = 0 + +#----------------------------- +# Delta time for Kalman filter +delta_time = 0.4 + +#----------------------------- +# Accel noise magnitude for Kalman filter +accel_noise = 0.2 + +#----------------------------- +# Distance threshold between region and object on two frames +dist_thresh = 0.8 + +#----------------------------- +# If this value > 0 than will be used circle with this radius +# If this value <= 0 than will be used ellipse with size (3*vx, 3*vy), vx and vy - horizontal and vertical speed in pixelsa +min_area_radius_pix = -1 + +#----------------------------- +# Minimal area radius in ration for object size. Used if min_area_radius_pix < 0 +min_area_radius_k = 0.8 + +#----------------------------- +# If the object do not assignment more than this frames then it will be removed +max_skip_frames = 50 + +#----------------------------- +# The maximum trajectory length +max_trace_len = 50 + +#----------------------------- +# Detection abandoned objects +detect_abandoned = 0 +# After this time (in seconds) the object is considered abandoned +min_static_time = 5 +# After this time (in seconds) the abandoned object will be removed +max_static_time = 25 +# Speed in pixels. If speed of object is more that this value than object is non static +max_speed_for_static = 10 diff --git a/example/examples.h b/example/examples.h index 1be76399..08b0fc67 100644 --- a/example/examples.h +++ b/example/examples.h @@ -652,7 +652,10 @@ class YoloTensorRTExample final : public VideoExample YOLOV8_OBB, YOLOv8Mask, YOLOv9, - YOLOv10 + YOLOv10, + YOLOv11, + YOLOv11_OBB, + YOLOv11Mask }; YOLOModels usedModel = YOLOModels::YOLOv9; switch (usedModel) diff --git a/src/Detector/OCVDNNDetector.cpp b/src/Detector/OCVDNNDetector.cpp index 01d1102f..3da65967 100644 --- a/src/Detector/OCVDNNDetector.cpp +++ b/src/Detector/OCVDNNDetector.cpp @@ -142,6 +142,9 @@ bool OCVDNNDetector::Init(const config_t& config) dictNetType["YOLOV8Mask"] = ModelType::YOLOV8Mask; dictNetType["YOLOV9"] = ModelType::YOLOV9; dictNetType["YOLOV10"] = ModelType::YOLOV10; + dictNetType["YOLOV11"] = ModelType::YOLOV11; + dictNetType["YOLOV11_OBB"] = ModelType::YOLOV11_OBB; + dictNetType["YOLOV11Mask"] = ModelType::YOLOV11Mask; auto netType = dictNetType.find(net_type->second); if (netType != dictNetType.end()) @@ -348,7 +351,7 @@ void OCVDNNDetector::DetectInCrop(const cv::UMat& colorFrame, const cv::Rect& cr } else { - if (m_netType == ModelType::YOLOV8 || m_netType == ModelType::YOLOV5 || m_netType == ModelType::YOLOV9 || m_netType == ModelType::YOLOV10) + if (m_netType == ModelType::YOLOV8 || m_netType == ModelType::YOLOV5 || m_netType == ModelType::YOLOV9 || m_netType == ModelType::YOLOV10 || m_netType == ModelType::YOLOV11) { int rows = detections[0].size[1]; int dimensions = detections[0].size[2]; @@ -370,7 +373,7 @@ void OCVDNNDetector::DetectInCrop(const cv::UMat& colorFrame, const cv::Rect& cr for (int i = 0; i < rows; ++i) { - if (m_netType == ModelType::YOLOV8 || m_netType == ModelType::YOLOV9) + if (m_netType == ModelType::YOLOV8 || m_netType == ModelType::YOLOV9 || m_netType == ModelType::YOLOV11) { float* classes_scores = data + 4; diff --git a/src/Detector/OCVDNNDetector.h b/src/Detector/OCVDNNDetector.h 
index 79842ba2..44d91b4d 100644 --- a/src/Detector/OCVDNNDetector.h +++ b/src/Detector/OCVDNNDetector.h @@ -42,7 +42,10 @@ class OCVDNNDetector final : public BaseDetector YOLOV8_OBB, YOLOV8Mask, YOLOV9, - YOLOV10 + YOLOV10, + YOLOV11, + YOLOV11_OBB, + YOLOV11Mask }; cv::dnn::Net m_net; diff --git a/src/Detector/YoloTensorRTDetector.cpp b/src/Detector/YoloTensorRTDetector.cpp index a0ebeb44..d1cfb352 100644 --- a/src/Detector/YoloTensorRTDetector.cpp +++ b/src/Detector/YoloTensorRTDetector.cpp @@ -107,6 +107,9 @@ bool YoloTensorRTDetector::Init(const config_t& config) dictNetType["YOLOV8Mask"] = tensor_rt::YOLOV8Mask; dictNetType["YOLOV9"] = tensor_rt::YOLOV9; dictNetType["YOLOV10"] = tensor_rt::YOLOV10; + dictNetType["YOLOV11"] = tensor_rt::YOLOV11; + dictNetType["YOLOV11_OBB"] = tensor_rt::YOLOV11_OBB; + dictNetType["YOLOV11Mask"] = tensor_rt::YOLOV11Mask; auto netType = dictNetType.find(net_type->second); if (netType != dictNetType.end()) @@ -298,7 +301,7 @@ void YoloTensorRTDetector::Detect(const std::vector& frames, std::vect /// void YoloTensorRTDetector::CalcMotionMap(cv::Mat& frame) { - if (m_localConfig.net_type == tensor_rt::YOLOV7Mask || m_localConfig.net_type == tensor_rt::YOLOV8Mask) + if (m_localConfig.net_type == tensor_rt::YOLOV7Mask || m_localConfig.net_type == tensor_rt::YOLOV8Mask || m_localConfig.net_type == tensor_rt::YOLOV11Mask) { static std::vector color; if (color.empty()) diff --git a/src/Detector/tensorrt_yolo/CMakeLists.txt b/src/Detector/tensorrt_yolo/CMakeLists.txt index 30f916bf..d09a2243 100644 --- a/src/Detector/tensorrt_yolo/CMakeLists.txt +++ b/src/Detector/tensorrt_yolo/CMakeLists.txt @@ -43,7 +43,7 @@ SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake) find_package(CUDNN REQUIRED) find_package(TensorRT REQUIRED) -message("TensorRT major version: " ${TensorRT_VERSION_MAJOR}) +message("TensorRT version: " ${TensorRT_VERSION}) include_directories(${OpenCV_INCLUDE_DIRS}) include_directories(${CUDA_INCLUDE_DIRS}) @@ -64,11 +64,8 @@ cuda_add_library(${libname_rt} SHARED #message("${OpenCV_LIBS}") #message(${OpenCV_DIR}) -if (MSVC) - file(GLOB TensorRT_LIBRARIES ${TensorRT_LIBRARY}) -endif() +set(TensorRT_LIBRARIES ${TensorRT_LIBRARIES} ${TensorRT_nvinfer_LIBRARY} ${TensorRT_nvinfer_plugin_LIBRARY} ${TensorRT_nvonnxparser_LIBRARY}) -message("TensorRT_LIBRARY: ${TensorRT_LIBRARY}") message("TensorRT_LIBRARIES: ${TensorRT_LIBRARIES}") @@ -84,9 +81,11 @@ set(TENSORRT_LIBS ${TensorRT_LIBRARIES}) if (CMAKE_COMPILER_IS_GNUCXX) - set(TENSORRT_LIBS ${TENSORRT_LIBS} stdc++fs nvinfer_plugin nvonnxparser) + set(TENSORRT_LIBS ${TENSORRT_LIBS} stdc++fs) endif(CMAKE_COMPILER_IS_GNUCXX) +message("TENSORRT_LIBS: ${TENSORRT_LIBS}") + target_link_libraries(${libname_rt} ${TENSORRT_LIBS}) install(TARGETS ${libname_rt} @@ -96,4 +95,4 @@ install(TARGETS ${libname_rt} RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_PREFIX}/include/${PROJECT_NAME}) -set_target_properties(${PROJECT_NAME} PROPERTIES FOLDER "libs") \ No newline at end of file +set_target_properties(${PROJECT_NAME} PROPERTIES FOLDER "libs") diff --git a/src/Detector/tensorrt_yolo/YoloONNX.cpp b/src/Detector/tensorrt_yolo/YoloONNX.cpp index 0b19d5cc..3ea99ec4 100644 --- a/src/Detector/tensorrt_yolo/YoloONNX.cpp +++ b/src/Detector/tensorrt_yolo/YoloONNX.cpp @@ -1,5 +1,7 @@ #include +#define DEFINE_TRT_ENTRYPOINTS 1 + #include "YoloONNX.hpp" #include "trt_utils.h" #include "../../common/defines.h" @@ -164,9 +166,9 @@ bool 
YoloONNX::ConstructNetwork(YoloONNXUniquePtr& builder, size_t dlaManagedSRAMSize = config->getMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_MANAGED_SRAM); size_t dlaLocalDRAMSize = config->getMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_LOCAL_DRAM); size_t dlaGlobalDRAMSize = config->getMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_GLOBAL_DRAM); - std::cout << "workspaceSize = " << workspaceSize << ", dlaManagedSRAMSize = " << dlaManagedSRAMSize << ", dlaLocalDRAMSize = " << dlaLocalDRAMSize << ", dlaGlobalDRAMSize = " << dlaGlobalDRAMSize << std::endl; + std::cout << "m_params.videoMemory = " << m_params.videoMemory << ", workspaceSize = " << workspaceSize << ", dlaManagedSRAMSize = " << dlaManagedSRAMSize << ", dlaLocalDRAMSize = " << dlaLocalDRAMSize << ", dlaGlobalDRAMSize = " << dlaGlobalDRAMSize << std::endl; - config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, m_params.videoMemory ? m_params.videoMemory : (1 << 20)); + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, m_params.videoMemory ? m_params.videoMemory : workspaceSize); #endif config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); diff --git a/src/Detector/tensorrt_yolo/YoloONNXv11_bb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv11_bb.hpp new file mode 100644 index 00000000..9103bfa6 --- /dev/null +++ b/src/Detector/tensorrt_yolo/YoloONNXv11_bb.hpp @@ -0,0 +1,111 @@ +#pragma once + +#include "YoloONNX.hpp" + +/// +/// \brief The YOLOv11_bb_onnx class +/// +class YOLOv11_bb_onnx : public YoloONNX +{ +protected: + /// + /// \brief GetResult + /// \param output + /// \return + /// + std::vector GetResult(size_t imgIdx, int /*keep_topk*/, const std::vector& outputs, cv::Size frameSize) + { + std::vector resBoxes; + + //0: name: images, size: 1x3x640x640 + //1: name: output0, size: 1x84x8400 + + const float fw = static_cast(frameSize.width) / static_cast(m_inputDims.d[3]); + const float fh = static_cast(frameSize.height) / static_cast(m_inputDims.d[2]); + + auto output = outputs[0]; + + size_t ncInd = 1; + size_t lenInd = 2; + int nc = m_outpuDims[0].d[ncInd] - 4; + int dimensions = nc + 4; + size_t len = static_cast(m_outpuDims[0].d[lenInd]) / m_params.explicitBatchSize; + //auto Volume = [](const nvinfer1::Dims& d) + //{ + // return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); + //}; + auto volume = len * m_outpuDims[0].d[ncInd]; // Volume(m_outpuDims[0]); + output += volume * imgIdx; + //std::cout << "len = " << len << ", nc = " << nc << ", m_params.confThreshold = " << m_params.confThreshold << ", volume = " << volume << std::endl; + + cv::Mat rawMemory(1, dimensions * static_cast(len), CV_32FC1, output); + rawMemory = rawMemory.reshape(1, dimensions); + cv::transpose(rawMemory, rawMemory); + output = (float*)rawMemory.data; + + //std::cout << "output[0] mem:\n"; + //for (size_t ii = 0; ii < 100; ++ii) + //{ + // std::cout << ii << ": "; + // for (size_t jj = 0; jj < 20; ++jj) + // { + // std::cout << output[ii * 20 + jj] << " "; + // } + // std::cout << ";" << std::endl; + //} + //std::cout << ";" << std::endl; + + std::vector classIds; + std::vector confidences; + std::vector rectBoxes; + classIds.reserve(len); + confidences.reserve(len); + rectBoxes.reserve(len); + + for (size_t i = 0; i < len; ++i) + { + // Box + size_t k = i * (nc + 4); + + int classId = -1; + float objectConf = 0.f; + for (int j = 0; j < nc; ++j) + { + const float classConf = output[k + 4 + j]; + if (classConf > objectConf) + { + classId = j; + objectConf = classConf; + } + } + + //if (i == 0) + // std::cout 
<< i << ": object_conf = " << object_conf << ", class_conf = " << class_conf << ", classId = " << classId << ", rect = " << cv::Rect(cvRound(x), cvRound(y), cvRound(width), cvRound(height)) << std::endl; + + if (objectConf >= m_params.confThreshold) + { + classIds.push_back(classId); + confidences.push_back(objectConf); + + // (center x, center y, width, height) to (x, y, w, h) + float x = fw * (output[k] - output[k + 2] / 2); + float y = fh * (output[k + 1] - output[k + 3] / 2); + float width = fw * output[k + 2]; + float height = fh * output[k + 3]; + rectBoxes.emplace_back(cvRound(x), cvRound(y), cvRound(width), cvRound(height)); + } + } + + // Non-maximum suppression to eliminate redudant overlapping boxes + std::vector indices; + cv::dnn::NMSBoxes(rectBoxes, confidences, m_params.confThreshold, m_params.nmsThreshold, indices); + resBoxes.reserve(indices.size()); + + for (size_t bi = 0; bi < indices.size(); ++bi) + { + resBoxes.emplace_back(classIds[indices[bi]], confidences[indices[bi]], rectBoxes[indices[bi]]); + } + + return resBoxes; + } +}; diff --git a/src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp b/src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp new file mode 100644 index 00000000..ea6ea2a2 --- /dev/null +++ b/src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp @@ -0,0 +1,301 @@ +#pragma once + +#include "YoloONNX.hpp" + +/// +/// \brief The YOLOv11_instance_onnx class +/// +class YOLOv11_instance_onnx : public YoloONNX +{ +protected: + /// + /// \brief GetResult + /// \param output + /// \return + /// + std::vector GetResult(size_t imgIdx, int /*keep_topk*/, const std::vector& outputs, cv::Size frameSize) + { + std::vector resBoxes; + + const float fw = static_cast(frameSize.width) / static_cast(m_inputDims.d[3]); + const float fh = static_cast(frameSize.height) / static_cast(m_inputDims.d[2]); + + size_t outInd = (outputs.size() == 0) ? 0 : 1; + size_t segInd = (outputs.size() == 0) ? 
1 : 0; + + auto output = outputs[0]; + + //std::cout << "output[1] mem:\n"; + //auto output1 = outputs[1]; + //for (size_t ii = 0; ii < 100; ++ii) + //{ + // std::cout << ii << ": "; + // for (size_t jj = 0; jj < 20; ++jj) + // { + // std::cout << output1[ii * 20 + jj] << " "; + // } + // std::cout << ";" << std::endl; + //} + //std::cout << ";" << std::endl; + + //0: name: images, size: 1x3x640x640 + //1: name: output1, size: 1x32x160x160 + //2: name: output0, size: 1x116x8400 + // 25200 = 3x80x80 + 3x40x40 + 3x20x20 + // 116 = x, y, w, h, 80 classes, 32 seg ancors + // 80 * 8 = 640, 40 * 16 = 640, 20 * 32 = 640 + + size_t ncInd = 1; + size_t lenInd = 2; + int nc = m_outpuDims[outInd].d[ncInd] - 4 - 32; + int dimensions = nc + 32 + 4; + size_t len = static_cast(m_outpuDims[outInd].d[lenInd]) / m_params.explicitBatchSize; + //auto Volume = [](const nvinfer1::Dims& d) + //{ + // return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); + //}; + auto volume = len * m_outpuDims[outInd].d[ncInd]; // Volume(m_outpuDims[0]); + output += volume * imgIdx; + //std::cout << "len = " << len << ", nc = " << nc << ", m_params.confThreshold = " << m_params.confThreshold << ", volume = " << volume << std::endl; + + cv::Mat rawMemory(1, dimensions * static_cast(len), CV_32FC1, output); + rawMemory = rawMemory.reshape(1, dimensions); + cv::transpose(rawMemory, rawMemory); + output = (float*)rawMemory.data; + + //std::cout << "output[0] mem:\n"; + //for (size_t ii = 0; ii < 100; ++ii) + //{ + // std::cout << ii << ": "; + // for (size_t jj = 0; jj < 20; ++jj) + // { + // std::cout << output[ii * 20 + jj] << " "; + // } + // std::cout << ";" << std::endl; + //} + //std::cout << ";" << std::endl; + +#if 1 + int segWidth = 160; + int segHeight = 160; + int segChannels = 32; + + if (outputs.size() > 1) + { + //std::cout << "output1 nbDims: " << m_outpuDims[segInd].nbDims << ", "; + //for (size_t i = 0; i < m_outpuDims[segInd].nbDims; ++i) + //{ + // std::cout << m_outpuDims[segInd].d[i]; + // if (i + 1 != m_outpuDims[segInd].nbDims) + // std::cout << "x"; + //} + //std::cout << std::endl; + //std::cout << "output nbDims: " << m_outpuDims[outInd].nbDims << ", "; + //for (size_t i = 0; i < m_outpuDims[outInd].nbDims; ++i) + //{ + // std::cout << m_outpuDims[outInd].d[i]; + // if (i + 1 != m_outpuDims[outInd].nbDims) + // std::cout << "x"; + //} + //std::cout << std::endl; + + segChannels = m_outpuDims[segInd].d[1]; + segWidth = m_outpuDims[segInd].d[2]; + segHeight = m_outpuDims[segInd].d[3]; + } + cv::Mat maskProposals; + std::vector> picked_proposals; + int net_width = nc + 4 + segChannels; +#endif + + std::vector classIds; + std::vector confidences; + std::vector rectBoxes; + classIds.reserve(len); + confidences.reserve(len); + rectBoxes.reserve(len); + + for (size_t i = 0; i < len; ++i) + { + // Box + size_t k = i * (nc + 4 + 32); + + int classId = -1; + float objectConf = 0.f; + for (int j = 0; j < nc; ++j) + { + const float classConf = output[k + 4 + j]; + if (classConf > objectConf) + { + classId = j; + objectConf = classConf; + } + } + + //if (i == 0) + //{ + // std::cout << "without nms: mem" << i << ": "; + // for (size_t ii = 0; ii < 4; ++ii) + // { + // std::cout << output[k + ii] << " "; + // } + // std::cout << ";" << std::endl; + // for (size_t ii = 4; ii < nc + 4; ++ii) + // { + // std::cout << output[k + ii] << " "; + // } + // std::cout << ";" << std::endl; + // for (size_t ii = nc + 4; ii < nc + 4 + 32; ++ii) + // { + // std::cout << output[k + ii] << " "; + // } + // std::cout << ";" 
<< std::endl; + //} + + if (objectConf >= m_params.confThreshold) + { + // (center x, center y, width, height) to (x, y, w, h) + float x = fw * (output[k] - output[k + 2] / 2); + float y = fh * (output[k + 1] - output[k + 3] / 2); + float width = fw * output[k + 2]; + float height = fh * output[k + 3]; + + //auto ClampToFrame = [](float& v, float& size, int hi) -> int + //{ + // int res = 0; +// + // if (size < 1) + // size = 0; +// + // if (v < 0) + // { + // res = v; + // v = 0; + // return res; + // } + // else if (v + size > hi - 1) + // { + // res = v; + // v = hi - 1 - size; + // if (v < 0) + // { + // size += v; + // v = 0; + // } + // res -= v; + // return res; + // } + // return res; + //}; + //ClampToFrame(x, width, frameSize.width); + //ClampToFrame(y, height, frameSize.height); + + //if (i == 0) + // std::cout << i << ": object_conf = " << object_conf << ", class_conf = " << class_conf << ", classId = " << classId << ", rect = " << cv::Rect(cvRound(x), cvRound(y), cvRound(width), cvRound(height)) << std::endl; + + if (width > 4 && height > 4) + { + classIds.push_back(classId); + confidences.push_back(objectConf); + rectBoxes.emplace_back(cvRound(x), cvRound(y), cvRound(width), cvRound(height)); + + std::vector temp_proto(output + k + 4 + nc, output + k + net_width); + picked_proposals.push_back(temp_proto); + } + } + } + + // Non-maximum suppression to eliminate redudant overlapping boxes + std::vector indices; + cv::dnn::NMSBoxes(rectBoxes, confidences, m_params.confThreshold, m_params.nmsThreshold, indices); + resBoxes.reserve(indices.size()); + + for (size_t bi = 0; bi < indices.size(); ++bi) + { + resBoxes.emplace_back(classIds[indices[bi]], confidences[indices[bi]], Clamp(rectBoxes[indices[bi]], frameSize)); + maskProposals.push_back(cv::Mat(picked_proposals[indices[bi]]).t()); + } + + if (!maskProposals.empty()) + { + // Mask processing + const float* pdata = outputs[1]; + std::vector maskFloat(pdata, pdata + segChannels * segWidth * segHeight); + + int INPUT_W = m_inputDims.d[3]; + int INPUT_H = m_inputDims.d[2]; + static constexpr float MASK_THRESHOLD = 0.5; + + cv::Mat mask_protos = cv::Mat(maskFloat); + cv::Mat protos = mask_protos.reshape(0, { segChannels, segWidth * segHeight }); + + cv::Mat matmulRes = (maskProposals * protos).t();//n*32 32*25600 + cv::Mat masks = matmulRes.reshape(static_cast(resBoxes.size()), { segWidth, segHeight }); + std::vector maskChannels; + split(masks, maskChannels); + for (size_t i = 0; i < resBoxes.size(); ++i) + { + cv::Mat dest; + cv::Mat mask; + //sigmoid + cv::exp(-maskChannels[i], dest); + dest = 1.0 / (1.0 + dest);//160*160 + + int padw = 0; + int padh = 0; + cv::Rect roi(int((float)padw / INPUT_W * segWidth), int((float)padh / INPUT_H * segHeight), int(segWidth - padw / 2), int(segHeight - padh / 2)); + dest = dest(roi); + + cv::resize(dest, mask, frameSize, cv::INTER_NEAREST); + + resBoxes[i].m_boxMask = mask(resBoxes[i].m_brect) > MASK_THRESHOLD; + +#if 0 + static int globalObjInd = 0; + SaveMat(resBoxes[i].m_boxMask, std::to_string(globalObjInd++), ".png", "tmp", true); +#endif + + std::vector> contours; + std::vector hierarchy; +#if (CV_VERSION_MAJOR < 4) + cv::findContours(resBoxes[i].m_boxMask, contours, hierarchy, CV_RETR_EXTERNAL, CV_CHAIN_APPROX_SIMPLE, cv::Point()); +#else + cv::findContours(resBoxes[i].m_boxMask, contours, hierarchy, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_SIMPLE, cv::Point()); +#endif + for (const auto& contour : contours) + { + cv::Rect br = cv::boundingRect(contour); + + if (br.width >= 4 && + br.height 
>= 4) + { + cv::RotatedRect rr = (contour.size() < 5) ? cv::minAreaRect(contour) : cv::fitEllipse(contour); + + br.x += resBoxes[i].m_brect.x; + br.y += resBoxes[i].m_brect.y; + rr.center.x += resBoxes[i].m_brect.x; + rr.center.y += resBoxes[i].m_brect.y; + + //std::cout << "rr: " << rr.center << ", " << rr.angle << ", " << rr.size << std::endl; + + if (resBoxes[i].m_boxMask.size() != br.size()) + { + br.width = resBoxes[i].m_boxMask.cols; + br.height = resBoxes[i].m_boxMask.rows; + if (br.x + br.width >= frameSize.width) + br.x = frameSize.width - br.width; + if (br.y + br.height >= frameSize.height) + br.y = frameSize.height - br.height; + } + + resBoxes[i].m_brect = br; + resBoxes[i].m_rrect = rr; + + break; + } + } + } + } + return resBoxes; + } +}; diff --git a/src/Detector/tensorrt_yolo/YoloONNXv11_obb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv11_obb.hpp new file mode 100644 index 00000000..7c2b98ce --- /dev/null +++ b/src/Detector/tensorrt_yolo/YoloONNXv11_obb.hpp @@ -0,0 +1,124 @@ +#pragma once + +#include "YoloONNX.hpp" + +/// +/// \brief The YOLOv11_obb_onnx class +/// +class YOLOv11_obb_onnx : public YoloONNX +{ +protected: + /// + /// \brief GetResult + /// \param output + /// \return + /// + std::vector GetResult(size_t imgIdx, int /*keep_topk*/, const std::vector& outputs, cv::Size frameSize) + { + std::vector resBoxes; + + //0: name: images, size: 1x3x1024x1024 + //1: name: output0, size: 1x20x21504 + //20: 15 DOTA classes + x + y + w + h + a + constexpr int shapeDataSize = 5; + + const float fw = static_cast(frameSize.width) / static_cast(m_inputDims.d[3]); + const float fh = static_cast(frameSize.height) / static_cast(m_inputDims.d[2]); + + auto output = outputs[0]; + + size_t ncInd = 1; + size_t lenInd = 2; + int nc = m_outpuDims[0].d[ncInd] - shapeDataSize; + int dimensions = nc + shapeDataSize; + size_t len = static_cast(m_outpuDims[0].d[lenInd]) / m_params.explicitBatchSize; + //auto Volume = [](const nvinfer1::Dims& d) + //{ + // return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); + //}; + auto volume = len * m_outpuDims[0].d[ncInd]; // Volume(m_outpuDims[0]); + output += volume * imgIdx; + //std::cout << "len = " << len << ", nc = " << nc << ", m_params.confThreshold = " << m_params.confThreshold << ", volume = " << volume << std::endl; + + cv::Mat rawMemory(1, dimensions * static_cast(len), CV_32FC1, output); + rawMemory = rawMemory.reshape(1, dimensions); + cv::transpose(rawMemory, rawMemory); + output = (float*)rawMemory.data; + + //std::cout << "output[0] mem:\n"; + //for (size_t ii = 0; ii < 100; ++ii) + //{ + // std::cout << ii << ": "; + // for (size_t jj = 0; jj < 20; ++jj) + // { + // std::cout << output[ii * 20 + jj] << " "; + // } + // std::cout << ";" << std::endl; + //} + //std::cout << ";" << std::endl; + + std::vector classIds; + std::vector confidences; + std::vector rectBoxes; + classIds.reserve(len); + confidences.reserve(len); + rectBoxes.reserve(len); + + for (size_t i = 0; i < len; ++i) + { + // Box + size_t k = i * (nc + shapeDataSize); + + int classId = -1; + float objectConf = 0.f; + for (int j = 0; j < nc; ++j) + { + const float classConf = output[k + 4 + j]; + if (classConf > objectConf) + { + classId = j; + objectConf = classConf; + } + } + + //if (i == 0) + //{ + // for (int jj = 0; jj < 20; ++jj) + // { + // std::cout << output[jj] << " "; + // } + // std::cout << std::endl; + //} + + if (objectConf >= m_params.confThreshold) + { + classIds.push_back(classId); + confidences.push_back(objectConf); + + // (center x, center y, 
width, height) + float cx = fw * output[k]; + float cy = fh * output[k + 1]; + float width = fw * output[k + 2]; + float height = fh * output[k + 3]; + float angle = 180.f * output[k + nc + shapeDataSize - 1] / M_PI; + rectBoxes.emplace_back(cv::Point2f(cx, cy), cv::Size2f(width, height), angle); + + //if (rectBoxes.size() == 1) + // std::cout << i << ": object_conf = " << objectConf << ", classId = " << classId << ", rect = " << rectBoxes.back().boundingRect() << ", angle = " << angle << std::endl; + } + } + + // Non-maximum suppression to eliminate redudant overlapping boxes + //std::vector indices; + //cv::dnn::NMSBoxes(rectBoxes, confidences, m_params.confThreshold, m_params.nmsThreshold, indices); + //resBoxes.reserve(indices.size()); + + resBoxes.reserve(rectBoxes.size()); + for (size_t bi = 0; bi < rectBoxes.size(); ++bi) + { + resBoxes.emplace_back(classIds[bi], confidences[bi], rectBoxes[bi]); + } + + return resBoxes; + } +}; diff --git a/src/Detector/tensorrt_yolo/class_detector.cpp b/src/Detector/tensorrt_yolo/class_detector.cpp index f7a18e23..895e5d96 100644 --- a/src/Detector/tensorrt_yolo/class_detector.cpp +++ b/src/Detector/tensorrt_yolo/class_detector.cpp @@ -10,6 +10,10 @@ #include "YoloONNXv8_instance.hpp" #include "YoloONNXv9_bb.hpp" #include "YoloONNXv10_bb.hpp" +#include "YoloONNXv11_bb.hpp" +#include "YoloONNXv11_obb.hpp" +#include "YoloONNXv11_instance.hpp" + namespace tensor_rt { @@ -110,6 +114,22 @@ namespace tensor_rt m_params.outputTensorNames.push_back("output0"); m_detector = std::make_unique(); break; + case ModelType::YOLOV11: + m_params.inputTensorNames.push_back("images"); + m_params.outputTensorNames.push_back("output0"); + m_detector = std::make_unique(); + break; + case ModelType::YOLOV11_OBB: + m_params.inputTensorNames.push_back("images"); + m_params.outputTensorNames.push_back("output0"); + m_detector = std::make_unique(); + break; + case ModelType::YOLOV11Mask: + m_params.inputTensorNames.push_back("images"); + m_params.outputTensorNames.push_back("output0"); + m_params.outputTensorNames.push_back("output1"); + m_detector = std::make_unique(); + break; } // Threshold values @@ -193,7 +213,8 @@ namespace tensor_rt if (config.net_type == ModelType::YOLOV6 || config.net_type == ModelType::YOLOV7 || config.net_type == ModelType::YOLOV7Mask || config.net_type == ModelType::YOLOV8 || config.net_type == ModelType::YOLOV8_OBB || config.net_type == ModelType::YOLOV8Mask || - config.net_type == ModelType::YOLOV9 || config.net_type == ModelType::YOLOV10) + config.net_type == ModelType::YOLOV9 || config.net_type == ModelType::YOLOV10 || + config.net_type == ModelType::YOLOV11 || config.net_type == ModelType::YOLOV11_OBB || config.net_type == ModelType::YOLOV11Mask) m_impl = new YoloONNXImpl(); else m_impl = new YoloDectectorImpl(); diff --git a/src/Detector/tensorrt_yolo/class_detector.h b/src/Detector/tensorrt_yolo/class_detector.h index 1dd85d70..b4da0d0a 100644 --- a/src/Detector/tensorrt_yolo/class_detector.h +++ b/src/Detector/tensorrt_yolo/class_detector.h @@ -54,7 +54,10 @@ namespace tensor_rt YOLOV8_OBB, YOLOV8Mask, YOLOV9, - YOLOV10 + YOLOV10, + YOLOV11, + YOLOV11_OBB, + YOLOV11Mask }; /// diff --git a/src/Detector/tensorrt_yolo/cmake/FindTensorRT.cmake b/src/Detector/tensorrt_yolo/cmake/FindTensorRT.cmake index 7ec8d998..b0099305 100644 --- a/src/Detector/tensorrt_yolo/cmake/FindTensorRT.cmake +++ b/src/Detector/tensorrt_yolo/cmake/FindTensorRT.cmake @@ -1,72 +1,115 @@ +# ~~~ +# Copyright 2021 Olivier Le Doeuff +# Permission is hereby granted, free 
of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # This module defines the following variables: # -# :: +# - TensorRT_FOUND: A boolean specifying whether or not TensorRT was found. +# - TensorRT_VERSION: The exact version of TensorRT found +# - TensorRT_VERSION_MAJOR: The major version of TensorRT. +# - TensorRT_VERSION_MINOR: The minor version of TensorRT. +# - TensorRT_VERSION_PATCH: The patch version of TensorRT. +# - TensorRT_VERSION_TWEAK: The tweak version of TensorRT. +# - TensorRT_INCLUDE_DIRS: The path to TensorRT ``include`` folder containing the header files required to compile a project linking against TensorRT. +# - TensorRT_LIBRARY_DIRS: The path to TensorRT library directory that contains libraries. # -# TensorRT_INCLUDE_DIRS -# TensorRT_LIBRARIES -# TensorRT_FOUND -# -# :: -# -# TensorRT_VERSION_STRING - version (x.y.z) -# TensorRT_VERSION_MAJOR - major version (x) -# TensorRT_VERSION_MINOR - minor version (y) -# TensorRT_VERSION_PATCH - patch version (z) +# This module create following targets: +# - trt::nvinfer +# - trt::nvinfer_plugin +# - trt::nvonnxparser +# - trt::nvparsers +# This script was inspired from https://github.com/NicolasIRAGNE/CMakeScripts +# This script was inspired from https://github.com/NVIDIA/tensorrt-laboratory/blob/master/cmake/FindTensorRT.cmake # # Hints # ^^^^^ # A user may set ``TensorRT_ROOT`` to an installation root to tell this module where to look. -# -set(_TensorRT_SEARCHES) +# ~~~ -if(TensorRT_ROOT) - set(_TensorRT_SEARCH_ROOT PATHS ${TensorRT_ROOT} NO_DEFAULT_PATH) - list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_ROOT) +if(NOT TensorRT_FIND_COMPONENTS) + set(TensorRT_FIND_COMPONENTS nvinfer nvinfer_plugin nvonnxparser) endif() +set(TensorRT_LIBRARIES) -# appends some common paths -set(_TensorRT_SEARCH_NORMAL - PATHS "/usr" +# find the include directory of TensorRT +find_path( + TensorRT_INCLUDE_DIR + NAMES NvInfer.h + PATHS ${TensorRT_ROOT} ENV TensorRT_ROOT + PATH_SUFFIXES include ) -list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_NORMAL) - -# Include dir -foreach(search ${_TensorRT_SEARCHES}) - find_path(TensorRT_INCLUDE_DIR NAMES NvInfer.h ${${search}} PATH_SUFFIXES include) -endforeach() -if(NOT TensorRT_LIBRARY) - foreach(search ${_TensorRT_SEARCHES}) - find_library(TensorRT_LIBRARY NAMES nvinfer ${${search}} PATH_SUFFIXES lib) - endforeach() +string(FIND ${TensorRT_INCLUDE_DIR} "NOTFOUND" _include_dir_notfound) +if(NOT _include_dir_notfound EQUAL -1) + if(TensorRT_FIND_REQUIRED) + message(FATAL_ERROR "Fail to find TensorRT, please set TensorRT_ROOT. 
Include path not found.") + endif() + return() endif() +set(TensorRT_INCLUDE_DIRS ${TensorRT_INCLUDE_DIR}) -mark_as_advanced(TensorRT_INCLUDE_DIR) - -if(TensorRT_INCLUDE_DIR AND EXISTS "${TensorRT_INCLUDE_DIR}/NvInfer.h") - file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$") - file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$") - file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$") +# Extract version of tensorrt +if(EXISTS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h") + file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$") + file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$") + file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$") + file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_TWEAK REGEX "^#define NV_TENSORRT_BUILD [0-9]+.*$") - string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MAJOR "${TensorRT_MAJOR}") - string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MINOR "${TensorRT_MINOR}") - string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" TensorRT_VERSION_PATCH "${TensorRT_PATCH}") - set(TensorRT_VERSION_STRING "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}") + string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MAJOR "${TensorRT_MAJOR}") + string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MINOR "${TensorRT_MINOR}") + string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" TensorRT_VERSION_PATCH "${TensorRT_PATCH}") + string(REGEX REPLACE "^#define NV_TENSORRT_BUILD ([0-9]+).*$" "\\1" TensorRT_VERSION_TWEAK "${TensorRT_TWEAK}") + set(TensorRT_VERSION "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}.${TensorRT_VERSION_TWEAK}") endif() -include(FindPackageHandleStandardArgs) -FIND_PACKAGE_HANDLE_STANDARD_ARGS(TensorRT REQUIRED_VARS TensorRT_LIBRARY TensorRT_INCLUDE_DIR VERSION_VAR TensorRT_VERSION_STRING) +function(_find_trt_component component) + + # Find library for component (ie nvinfer, nvparsers, etc...) 
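# Each component resolved by this helper is appended to TensorRT_LIBRARIES and
# also exposed as an imported INTERFACE target trt::<component>, so a consumer
# can use either the variable style (as this project's TENSORRT_LIBS does) or
# the target style. Illustrative usage, not part of this module:
#   find_package(TensorRT REQUIRED COMPONENTS nvinfer nvinfer_plugin nvonnxparser)
#   target_link_libraries(my_target PRIVATE trt::nvinfer trt::nvonnxparser)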
+ find_library( + TensorRT_${component}_LIBRARY + NAMES ${component} + PATHS ${TensorRT_ROOT} ${TENSORRT_LIBRARY_DIR} ENV TensorRT_ROOT + ) -if(TensorRT_FOUND) - set(TensorRT_INCLUDE_DIRS ${TensorRT_INCLUDE_DIR}) + string(FIND ${TensorRT_${component}_LIBRARY} "NOTFOUND" _library_not_found) - if(NOT TensorRT_LIBRARIES) - set(TensorRT_LIBRARIES ${TensorRT_LIBRARY}) + if(NOT TensorRT_LIBRARY_DIR) + get_filename_component(_path ${TensorRT_${component}_LIBRARY} DIRECTORY) + set(TensorRT_LIBRARY_DIR + "${_path}" + CACHE INTERNAL "TensorRT_LIBRARY_DIR" + ) endif() - if(NOT TARGET TensorRT::TensorRT) - add_library(TensorRT::TensorRT UNKNOWN IMPORTED) - set_target_properties(TensorRT::TensorRT PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIRS}") - set_property(TARGET TensorRT::TensorRT APPEND PROPERTY IMPORTED_LOCATION "${TensorRT_LIBRARY}") + if(NOT TensorRT_LIBRARY_DIRS) + get_filename_component(_path ${TensorRT_${component}_LIBRARY} DIRECTORY) + set(TensorRT_LIBRARY_DIRS + "${_path}" + CACHE INTERNAL "TensorRT_LIBRARY_DIRS" + ) endif() -endif() + + # Library found, and doesn't already exists + if(_library_not_found EQUAL -1 AND NOT TARGET trt::${component}) + set(TensorRT_${component}_FOUND + TRUE + CACHE INTERNAL "Found ${component}" + ) + + # Create a target + add_library(trt::${component} IMPORTED INTERFACE) + target_include_directories(trt::${component} SYSTEM INTERFACE "${TensorRT_INCLUDE_DIRS}") + target_link_libraries(trt::${component} INTERFACE "${TensorRT_${component}_LIBRARY}") + set(TensorRT_LIBRARIES ${TensorRT_LIBRARIES} ${TensorRT_${component}_LIBRARY}) + endif() + +endfunction() + +# Find each components +foreach(component IN LISTS TensorRT_FIND_COMPONENTS) + _find_trt_component(${component}) +endforeach() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(TensorRT HANDLE_COMPONENTS VERSION_VAR TensorRT_VERSION REQUIRED_VARS TensorRT_INCLUDE_DIR) diff --git a/src/Detector/tensorrt_yolo/common/sampleEngines.cpp_ b/src/Detector/tensorrt_yolo/common/sampleEngines.cpp similarity index 99% rename from src/Detector/tensorrt_yolo/common/sampleEngines.cpp_ rename to src/Detector/tensorrt_yolo/common/sampleEngines.cpp index 8ada0526..dacf6f2a 100644 --- a/src/Detector/tensorrt_yolo/common/sampleEngines.cpp_ +++ b/src/Detector/tensorrt_yolo/common/sampleEngines.cpp @@ -559,7 +559,7 @@ void setLayerDeviceTypes( if (match != layerDeviceTypes.end()) { DeviceType const deviceType = match->second; - sample::gLogInfo << "Set layer " << layerName << " to device type " << deviceType << std::endl; + sample::gLogInfo << "Set layer " << layerName << " to device type " << (int)deviceType << std::endl; config.setDeviceType(layer, deviceType); } } @@ -845,7 +845,11 @@ bool setupNetworkAndConfig(BuildOptions const& build, SystemOptions const& sys, if (build.maxTactics != defaultMaxTactics) { +#if (NV_TENSORRT_MAJOR < 9) config.setMaxNbTactics(build.maxTactics); +#else + config.setTacticSources(build.maxTactics); +#endif } if (build.timingCacheMode == TimingCacheMode::kDISABLE) diff --git a/src/Detector/tensorrt_yolo/common/sampleInference.cpp_ b/src/Detector/tensorrt_yolo/common/sampleInference.cpp similarity index 100% rename from src/Detector/tensorrt_yolo/common/sampleInference.cpp_ rename to src/Detector/tensorrt_yolo/common/sampleInference.cpp diff --git a/src/Detector/tensorrt_yolo/common_deprecated/BatchStream.h b/src/Detector/tensorrt_yolo/common_deprecated/BatchStream.h deleted file mode 100644 index 9eaac768..00000000 --- 
a/src/Detector/tensorrt_yolo/common_deprecated/BatchStream.h +++ /dev/null @@ -1,388 +0,0 @@ -/* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef BATCH_STREAM_H -#define BATCH_STREAM_H - -#include "NvInfer.h" -#include "common.h" -#include -#include -#include - -class IBatchStream -{ -public: - virtual void reset(int firstBatch) = 0; - virtual bool next() = 0; - virtual void skip(int skipCount) = 0; - virtual float* getBatch() = 0; - virtual float* getLabels() = 0; - virtual int getBatchesRead() const = 0; - virtual int getBatchSize() const = 0; - virtual nvinfer1::Dims getDims() const = 0; -}; - -class MNISTBatchStream : public IBatchStream -{ -public: - MNISTBatchStream(int batchSize, int maxBatches, const std::string& dataFile, const std::string& labelsFile, - const std::vector& directories) - : mBatchSize{batchSize} - , mMaxBatches{maxBatches} - , mDims{3, {1, 28, 28}} //!< We already know the dimensions of MNIST images. - { - readDataFile(locateFile(dataFile, directories)); - readLabelsFile(locateFile(labelsFile, directories)); - } - - void reset(int firstBatch) override - { - mBatchCount = firstBatch; - } - - bool next() override - { - if (mBatchCount >= mMaxBatches) - { - return false; - } - ++mBatchCount; - return true; - } - - void skip(int skipCount) override - { - mBatchCount += skipCount; - } - - float* getBatch() override - { - return mData.data() + (mBatchCount * mBatchSize * samplesCommon::volume(mDims)); - } - - float* getLabels() override - { - return mLabels.data() + (mBatchCount * mBatchSize); - } - - int getBatchesRead() const override - { - return mBatchCount; - } - - int getBatchSize() const override - { - return mBatchSize; - } - - nvinfer1::Dims getDims() const override - { - return nvinfer1::Dims{4, {mBatchSize, mDims.d[0], mDims.d[1], mDims.d[2]}}; - } - -private: - void readDataFile(const std::string& dataFilePath) - { - std::ifstream file{dataFilePath.c_str(), std::ios::binary}; - - int magicNumber, numImages, imageH, imageW; - file.read(reinterpret_cast(&magicNumber), sizeof(magicNumber)); - // All values in the MNIST files are big endian. - magicNumber = samplesCommon::swapEndianness(magicNumber); - ASSERT(magicNumber == 2051 && "Magic Number does not match the expected value for an MNIST image set"); - - // Read number of images and dimensions - file.read(reinterpret_cast(&numImages), sizeof(numImages)); - file.read(reinterpret_cast(&imageH), sizeof(imageH)); - file.read(reinterpret_cast(&imageW), sizeof(imageW)); - - numImages = samplesCommon::swapEndianness(numImages); - imageH = samplesCommon::swapEndianness(imageH); - imageW = samplesCommon::swapEndianness(imageW); - - // The MNIST data is made up of unsigned bytes, so we need to cast to float and normalize. 
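    // Normalization detail for the block below: each MNIST pixel is a uint8 in
    // [0, 255], so val / 255.f maps it into [0, 1]. The header fields read above
    // had to be byte-swapped first because the IDX format stores them big-endian.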
- int numElements = numImages * imageH * imageW; - std::vector rawData(numElements); - file.read(reinterpret_cast(rawData.data()), numElements * sizeof(uint8_t)); - mData.resize(numElements); - std::transform( - rawData.begin(), rawData.end(), mData.begin(), [](uint8_t val) { return static_cast(val) / 255.f; }); - } - - void readLabelsFile(const std::string& labelsFilePath) - { - std::ifstream file{labelsFilePath.c_str(), std::ios::binary}; - int magicNumber, numImages; - file.read(reinterpret_cast(&magicNumber), sizeof(magicNumber)); - // All values in the MNIST files are big endian. - magicNumber = samplesCommon::swapEndianness(magicNumber); - ASSERT(magicNumber == 2049 && "Magic Number does not match the expected value for an MNIST labels file"); - - file.read(reinterpret_cast(&numImages), sizeof(numImages)); - numImages = samplesCommon::swapEndianness(numImages); - - std::vector rawLabels(numImages); - file.read(reinterpret_cast(rawLabels.data()), numImages * sizeof(uint8_t)); - mLabels.resize(numImages); - std::transform( - rawLabels.begin(), rawLabels.end(), mLabels.begin(), [](uint8_t val) { return static_cast(val); }); - } - - int mBatchSize{0}; - int mBatchCount{0}; //!< The batch that will be read on the next invocation of next() - int mMaxBatches{0}; - nvinfer1::Dims mDims{}; - std::vector mData{}; - std::vector mLabels{}; -}; - -class BatchStream : public IBatchStream -{ -public: - BatchStream( - int batchSize, int maxBatches, std::string prefix, std::string suffix, std::vector directories) - : mBatchSize(batchSize) - , mMaxBatches(maxBatches) - , mPrefix(prefix) - , mSuffix(suffix) - , mDataDir(directories) - { - FILE* file = fopen(locateFile(mPrefix + std::string("0") + mSuffix, mDataDir).c_str(), "rb"); - ASSERT(file != nullptr); - int d[4]; - size_t readSize = fread(d, sizeof(int), 4, file); - ASSERT(readSize == 4); - mDims.nbDims = 4; // The number of dimensions. - mDims.d[0] = d[0]; // Batch Size - mDims.d[1] = d[1]; // Channels - mDims.d[2] = d[2]; // Height - mDims.d[3] = d[3]; // Width - ASSERT(mDims.d[0] > 0 && mDims.d[1] > 0 && mDims.d[2] > 0 && mDims.d[3] > 0); - fclose(file); - - mImageSize = mDims.d[1] * mDims.d[2] * mDims.d[3]; - mBatch.resize(mBatchSize * mImageSize, 0); - mLabels.resize(mBatchSize, 0); - mFileBatch.resize(mDims.d[0] * mImageSize, 0); - mFileLabels.resize(mDims.d[0], 0); - reset(0); - } - - BatchStream(int batchSize, int maxBatches, std::string prefix, std::vector directories) - : BatchStream(batchSize, maxBatches, prefix, ".batch", directories) - { - } - - BatchStream( - int batchSize, int maxBatches, nvinfer1::Dims dims, std::string listFile, std::vector directories) - : mBatchSize(batchSize) - , mMaxBatches(maxBatches) - , mDims(dims) - , mListFile(listFile) - , mDataDir(directories) - { - mImageSize = mDims.d[1] * mDims.d[2] * mDims.d[3]; - mBatch.resize(mBatchSize * mImageSize, 0); - mLabels.resize(mBatchSize, 0); - mFileBatch.resize(mDims.d[0] * mImageSize, 0); - mFileLabels.resize(mDims.d[0], 0); - reset(0); - } - - // Resets data members - void reset(int firstBatch) override - { - mBatchCount = 0; - mFileCount = 0; - mFileBatchPos = mDims.d[0]; - skip(firstBatch); - } - - // Advance to next batch and return true, or return false if there is no batch left. 
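-    // When mBatchSize differs from the file batch size mDims.d[0], a single call may
-    // splice data from several batch files, reloading via update() as each file drains.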
- bool next() override - { - if (mBatchCount == mMaxBatches) - { - return false; - } - - for (int csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize) - { - ASSERT(mFileBatchPos > 0 && mFileBatchPos <= mDims.d[0]); - if (mFileBatchPos == mDims.d[0] && !update()) - { - return false; - } - - // copy the smaller of: elements left to fulfill the request, or elements left in the file buffer. - csize = std::min(mBatchSize - batchPos, mDims.d[0] - mFileBatchPos); - std::copy_n( - getFileBatch() + mFileBatchPos * mImageSize, csize * mImageSize, getBatch() + batchPos * mImageSize); - std::copy_n(getFileLabels() + mFileBatchPos, csize, getLabels() + batchPos); - } - mBatchCount++; - return true; - } - - // Skips the batches - void skip(int skipCount) override - { - if (mBatchSize >= mDims.d[0] && mBatchSize % mDims.d[0] == 0 && mFileBatchPos == mDims.d[0]) - { - mFileCount += skipCount * mBatchSize / mDims.d[0]; - return; - } - - int x = mBatchCount; - for (int i = 0; i < skipCount; i++) - { - next(); - } - mBatchCount = x; - } - - float* getBatch() override - { - return mBatch.data(); - } - - float* getLabels() override - { - return mLabels.data(); - } - - int getBatchesRead() const override - { - return mBatchCount; - } - - int getBatchSize() const override - { - return mBatchSize; - } - - nvinfer1::Dims getDims() const override - { - return mDims; - } - -private: - float* getFileBatch() - { - return mFileBatch.data(); - } - - float* getFileLabels() - { - return mFileLabels.data(); - } - - bool update() - { - if (mListFile.empty()) - { - std::string inputFileName = locateFile(mPrefix + std::to_string(mFileCount++) + mSuffix, mDataDir); - FILE* file = fopen(inputFileName.c_str(), "rb"); - if (!file) - { - return false; - } - - int d[4]; - size_t readSize = fread(d, sizeof(int), 4, file); - ASSERT(readSize == 4); - ASSERT(mDims.d[0] == d[0] && mDims.d[1] == d[1] && mDims.d[2] == d[2] && mDims.d[3] == d[3]); - size_t readInputCount = fread(getFileBatch(), sizeof(float), mDims.d[0] * mImageSize, file); - ASSERT(readInputCount == size_t(mDims.d[0] * mImageSize)); - size_t readLabelCount = fread(getFileLabels(), sizeof(float), mDims.d[0], file); - ASSERT(readLabelCount == 0 || readLabelCount == size_t(mDims.d[0])); - - fclose(file); - } - else - { - std::vector fNames; - std::ifstream file(locateFile(mListFile, mDataDir), std::ios::binary); - if (!file) - { - return false; - } - - sample::gLogInfo << "Batch #" << mFileCount << std::endl; - file.seekg(((mBatchCount * mBatchSize)) * 7); - - for (int i = 1; i <= mBatchSize; i++) - { - std::string sName; - std::getline(file, sName); - sName = sName + ".ppm"; - sample::gLogInfo << "Calibrating with file " << sName << std::endl; - fNames.emplace_back(sName); - } - - mFileCount++; - - const int imageC = 3; - const int imageH = 300; - const int imageW = 300; - std::vector> ppms(fNames.size()); - for (uint32_t i = 0; i < fNames.size(); ++i) - { - readPPMFile(locateFile(fNames[i], mDataDir), ppms[i]); - } - - std::vector data(samplesCommon::volume(mDims)); - const float scale = 2.0 / 255.0; - const float bias = 1.0; - long int volChl = mDims.d[2] * mDims.d[3]; - - // Normalize input data - for (int i = 0, volImg = mDims.d[1] * mDims.d[2] * mDims.d[3]; i < mBatchSize; ++i) - { - for (int c = 0; c < mDims.d[1]; ++c) - { - for (int j = 0; j < volChl; ++j) - { - data[i * volImg + c * volChl + j] = scale * float(ppms[i].buffer[j * mDims.d[1] + c]) - bias; - } - } - } - - std::copy_n(data.data(), mDims.d[0] * mImageSize, 
getFileBatch()); - } - - mFileBatchPos = 0; - return true; - } - - int mBatchSize{0}; - int mMaxBatches{0}; - int mBatchCount{0}; - int mFileCount{0}; - int mFileBatchPos{0}; - int mImageSize{0}; - std::vector mBatch; //!< Data for the batch - std::vector mLabels; //!< Labels for the batch - std::vector mFileBatch; //!< List of image files - std::vector mFileLabels; //!< List of label files - std::string mPrefix; //!< Batch file name prefix - std::string mSuffix; //!< Batch file name suffix - nvinfer1::Dims mDims; //!< Input dimensions - std::string mListFile; //!< File name of the list of image names - std::vector mDataDir; //!< Directories where the files can be found -}; - -#endif diff --git a/src/Detector/tensorrt_yolo/common_deprecated/EntropyCalibrator.h b/src/Detector/tensorrt_yolo/common_deprecated/EntropyCalibrator.h deleted file mode 100644 index f31789bf..00000000 --- a/src/Detector/tensorrt_yolo/common_deprecated/EntropyCalibrator.h +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ENTROPY_CALIBRATOR_H -#define ENTROPY_CALIBRATOR_H - -#include "BatchStream.h" -#include "NvInfer.h" - -//! \class EntropyCalibratorImpl -//! -//! \brief Implements common functionality for Entropy calibrators. -//! -template -class EntropyCalibratorImpl -{ -public: - EntropyCalibratorImpl( - TBatchStream stream, int firstBatch, std::string networkName, const char* inputBlobName, bool readCache = true) - : mStream{stream} - , mCalibrationTableName("CalibrationTable" + networkName) - , mInputBlobName(inputBlobName) - , mReadCache(readCache) - { - nvinfer1::Dims dims = mStream.getDims(); - mInputCount = samplesCommon::volume(dims); - CHECK(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float))); - mStream.reset(firstBatch); - } - - virtual ~EntropyCalibratorImpl() - { - CHECK(cudaFree(mDeviceInput)); - } - - int getBatchSize() const noexcept - { - return mStream.getBatchSize(); - } - - bool getBatch(void* bindings[], const char* names[], int /*nbBindings*/) noexcept - { - if (!mStream.next()) - return false; - - CHECK(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice)); - ASSERT(!strcmp(names[0], mInputBlobName)); - bindings[0] = mDeviceInput; - return true; - } - - const void* readCalibrationCache(size_t& length) noexcept - { - mCalibrationCache.clear(); - std::ifstream input(mCalibrationTableName, std::ios::binary); - input >> std::noskipws; - if (mReadCache && input.good()) - { - std::copy(std::istream_iterator(input), std::istream_iterator(), - std::back_inserter(mCalibrationCache)); - } - length = mCalibrationCache.size(); - return length ? 
mCalibrationCache.data() : nullptr; - } - - void writeCalibrationCache(const void* cache, size_t length) noexcept - { - std::ofstream output(mCalibrationTableName, std::ios::binary); - output.write(reinterpret_cast(cache), length); - } - -private: - TBatchStream mStream; - size_t mInputCount; - std::string mCalibrationTableName; - const char* mInputBlobName; - bool mReadCache{true}; - void* mDeviceInput{nullptr}; - std::vector mCalibrationCache; -}; - -//! \class Int8EntropyCalibrator2 -//! -//! \brief Implements Entropy calibrator 2. -//! CalibrationAlgoType is kENTROPY_CALIBRATION_2. -//! -template -class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 -{ -public: - Int8EntropyCalibrator2( - TBatchStream stream, int firstBatch, const char* networkName, const char* inputBlobName, bool readCache = true) - : mImpl(stream, firstBatch, networkName, inputBlobName, readCache) - { - } - - int getBatchSize() const noexcept override - { - return mImpl.getBatchSize(); - } - - bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept override - { - return mImpl.getBatch(bindings, names, nbBindings); - } - - const void* readCalibrationCache(size_t& length) noexcept override - { - return mImpl.readCalibrationCache(length); - } - - void writeCalibrationCache(const void* cache, size_t length) noexcept override - { - mImpl.writeCalibrationCache(cache, length); - } - -private: - EntropyCalibratorImpl mImpl; -}; - -#endif // ENTROPY_CALIBRATOR_H diff --git a/src/Detector/tensorrt_yolo/common_deprecated/ErrorRecorder.h b/src/Detector/tensorrt_yolo/common_deprecated/ErrorRecorder.h deleted file mode 100644 index 40b35fb5..00000000 --- a/src/Detector/tensorrt_yolo/common_deprecated/ErrorRecorder.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ERROR_RECORDER_H -#define ERROR_RECORDER_H -#include "NvInferRuntimeCommon.h" -#include "logger.h" -#include -#include -#include -#include -#include - -using nvinfer1::IErrorRecorder; -using nvinfer1::ErrorCode; - -//! -//! A simple implementation of the IErrorRecorder interface for -//! use by samples. This interface also can be used as a reference -//! implementation. -//! The sample Error recorder is based on a vector that pairs the error -//! code and the error string into a single element. It also uses -//! standard mutex's and atomics in order to make sure that the code -//! works in a multi-threaded environment. -//! -class SampleErrorRecorder : public IErrorRecorder -{ - using errorPair = std::pair; - using errorStack = std::vector; - -public: - SampleErrorRecorder() = default; - - virtual ~SampleErrorRecorder() noexcept {} - int32_t getNbErrors() const noexcept final - { - return mErrorStack.size(); - } - ErrorCode getErrorCode(int32_t errorIdx) const noexcept final - { - return invalidIndexCheck(errorIdx) ? 
ErrorCode::kINVALID_ARGUMENT : (*this)[errorIdx].first; - }; - IErrorRecorder::ErrorDesc getErrorDesc(int32_t errorIdx) const noexcept final - { - return invalidIndexCheck(errorIdx) ? "errorIdx out of range." : (*this)[errorIdx].second.c_str(); - } - // This class can never overflow since we have dynamic resize via std::vector usage. - bool hasOverflowed() const noexcept final - { - return false; - } - - // Empty the errorStack. - void clear() noexcept final - { - try - { - // grab a lock so that there is no addition while clearing. - std::lock_guard guard(mStackLock); - mErrorStack.clear(); - } - catch (const std::exception& e) - { - sample::gLogFatal << "Internal Error: " << e.what() << std::endl; - } - }; - - //! Simple helper function that - bool empty() const noexcept - { - return mErrorStack.empty(); - } - - bool reportError(ErrorCode val, IErrorRecorder::ErrorDesc desc) noexcept final - { - try - { - std::lock_guard guard(mStackLock); - sample::gLogError << "Error[" << static_cast(val) << "]: " << desc << std::endl; - mErrorStack.push_back(errorPair(val, desc)); - } - catch (const std::exception& e) - { - sample::gLogFatal << "Internal Error: " << e.what() << std::endl; - } - // All errors are considered fatal. - return true; - } - - // Atomically increment or decrement the ref counter. - IErrorRecorder::RefCount incRefCount() noexcept final - { - return ++mRefCount; - } - IErrorRecorder::RefCount decRefCount() noexcept final - { - return --mRefCount; - } - -private: - // Simple helper functions. - const errorPair& operator[](size_t index) const noexcept - { - return mErrorStack[index]; - } - - bool invalidIndexCheck(int32_t index) const noexcept - { - // By converting signed to unsigned, we only need a single check since - // negative numbers turn into large positive greater than the size. - size_t sIndex = index; - return sIndex >= mErrorStack.size(); - } - // Mutex to hold when locking mErrorStack. - std::mutex mStackLock; - - // Reference count of the class. Destruction of the class when mRefCount - // is not zero causes undefined behavior. - std::atomic mRefCount{0}; - - // The error stack that holds the errors recorded by TensorRT. - errorStack mErrorStack; -}; // class SampleErrorRecorder -#endif // ERROR_RECORDER_H diff --git a/src/Detector/tensorrt_yolo/common_deprecated/buffers.h b/src/Detector/tensorrt_yolo/common_deprecated/buffers.h deleted file mode 100644 index ef673b2b..00000000 --- a/src/Detector/tensorrt_yolo/common_deprecated/buffers.h +++ /dev/null @@ -1,478 +0,0 @@ -/* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef TENSORRT_BUFFERS_H -#define TENSORRT_BUFFERS_H - -#include "NvInfer.h" -#include "common.h" -#include "half.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace samplesCommon -{ - -//! -//! \brief The GenericBuffer class is a templated class for buffers. -//! -//! 
\details This templated RAII (Resource Acquisition Is Initialization) class handles the allocation, -//! deallocation, querying of buffers on both the device and the host. -//! It can handle data of arbitrary types because it stores byte buffers. -//! The template parameters AllocFunc and FreeFunc are used for the -//! allocation and deallocation of the buffer. -//! AllocFunc must be a functor that takes in (void** ptr, size_t size) -//! and returns bool. ptr is a pointer to where the allocated buffer address should be stored. -//! size is the amount of memory in bytes to allocate. -//! The boolean indicates whether or not the memory allocation was successful. -//! FreeFunc must be a functor that takes in (void* ptr) and returns void. -//! ptr is the allocated buffer address. It must work with nullptr input. -//! -template -class GenericBuffer -{ -public: - //! - //! \brief Construct an empty buffer. - //! - GenericBuffer(nvinfer1::DataType type = nvinfer1::DataType::kFLOAT) - : mSize(0) - , mCapacity(0) - , mType(type) - , mBuffer(nullptr) - { - } - - //! - //! \brief Construct a buffer with the specified allocation size in bytes. - //! - GenericBuffer(size_t size, nvinfer1::DataType type) - : mSize(size) - , mCapacity(size) - , mType(type) - { - if (!allocFn(&mBuffer, this->nbBytes())) - { - throw std::bad_alloc(); - } - } - - GenericBuffer(GenericBuffer&& buf) - : mSize(buf.mSize) - , mCapacity(buf.mCapacity) - , mType(buf.mType) - , mBuffer(buf.mBuffer) - { - buf.mSize = 0; - buf.mCapacity = 0; - buf.mType = nvinfer1::DataType::kFLOAT; - buf.mBuffer = nullptr; - } - - GenericBuffer& operator=(GenericBuffer&& buf) - { - if (this != &buf) - { - freeFn(mBuffer); - mSize = buf.mSize; - mCapacity = buf.mCapacity; - mType = buf.mType; - mBuffer = buf.mBuffer; - // Reset buf. - buf.mSize = 0; - buf.mCapacity = 0; - buf.mBuffer = nullptr; - } - return *this; - } - - //! - //! \brief Returns pointer to underlying array. - //! - void* data() - { - return mBuffer; - } - - //! - //! \brief Returns pointer to underlying array. - //! - const void* data() const - { - return mBuffer; - } - - //! - //! \brief Returns the size (in number of elements) of the buffer. - //! - size_t size() const - { - return mSize; - } - - //! - //! \brief Returns the size (in bytes) of the buffer. - //! - size_t nbBytes() const - { - return this->size() * samplesCommon::getElementSize(mType); - } - - //! - //! \brief Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity. - //! - void resize(size_t newSize) - { - mSize = newSize; - if (mCapacity < newSize) - { - freeFn(mBuffer); - if (!allocFn(&mBuffer, this->nbBytes())) - { - throw std::bad_alloc{}; - } - mCapacity = newSize; - } - } - - //! - //! \brief Overload of resize that accepts Dims - //! 
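-    //! Equivalent to resize(samplesCommon::volume(dims)).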
- void resize(const nvinfer1::Dims& dims) - { - return this->resize(samplesCommon::volume(dims)); - } - - ~GenericBuffer() - { - freeFn(mBuffer); - } - -private: - size_t mSize{0}, mCapacity{0}; - nvinfer1::DataType mType; - void* mBuffer; - AllocFunc allocFn; - FreeFunc freeFn; -}; - -class DeviceAllocator -{ -public: - bool operator()(void** ptr, size_t size) const - { - return cudaMalloc(ptr, size) == cudaSuccess; - } -}; - -class DeviceFree -{ -public: - void operator()(void* ptr) const - { - cudaFree(ptr); - } -}; - -class HostAllocator -{ -public: - bool operator()(void** ptr, size_t size) const - { - *ptr = malloc(size); - return *ptr != nullptr; - } -}; - -class HostFree -{ -public: - void operator()(void* ptr) const - { - free(ptr); - } -}; - -using DeviceBuffer = GenericBuffer; -using HostBuffer = GenericBuffer; - -//! -//! \brief The ManagedBuffer class groups together a pair of corresponding device and host buffers. -//! -class ManagedBuffer -{ -public: - DeviceBuffer deviceBuffer; - HostBuffer hostBuffer; -}; - -//! -//! \brief The BufferManager class handles host and device buffer allocation and deallocation. -//! -//! \details This RAII class handles host and device buffer allocation and deallocation, -//! memcpy between host and device buffers to aid with inference, -//! and debugging dumps to validate inference. The BufferManager class is meant to be -//! used to simplify buffer management and any interactions between buffers and the engine. -//! -class BufferManager -{ -public: - static const size_t kINVALID_SIZE_VALUE = ~size_t(0); - - //! - //! \brief Create a BufferManager for handling buffer interactions with engine. - //! - BufferManager(std::shared_ptr engine, const int batchSize, - const nvinfer1::IExecutionContext* context = nullptr) - : mEngine(engine) - , mBatchSize(batchSize) - { - // Full Dims implies no batch size. - auto impbs = engine->hasImplicitBatchDimension(); - std::cout << "hasImplicitBatchDimension: " << impbs << ", mBatchSize = " << mBatchSize << std::endl; - assert(engine->hasImplicitBatchDimension() || mBatchSize == 0); - // Create host and device buffers - for (int i = 0; i < mEngine->getNbBindings(); i++) - { - auto dims = context ? context->getBindingDimensions(i) : mEngine->getBindingDimensions(i); - size_t vol = context || !mBatchSize ? 1 : static_cast(mBatchSize); - nvinfer1::DataType type = mEngine->getBindingDataType(i); - int vecDim = mEngine->getBindingVectorizedDim(i); - if (-1 != vecDim) // i.e., 0 != lgScalarsPerVector - { - int scalarsPerVec = mEngine->getBindingComponentsPerElement(i); - dims.d[vecDim] = divUp(dims.d[vecDim], scalarsPerVec); - vol *= scalarsPerVec; - } - vol *= samplesCommon::volume(dims); - std::unique_ptr manBuf{new ManagedBuffer()}; - manBuf->deviceBuffer = DeviceBuffer(vol, type); - manBuf->hostBuffer = HostBuffer(vol, type); - mDeviceBindings.emplace_back(manBuf->deviceBuffer.data()); - mManagedBuffers.emplace_back(std::move(manBuf)); - } - } - - //! - //! \brief Returns a vector of device buffers that you can use directly as - //! bindings for the execute and enqueue methods of IExecutionContext. - //! - std::vector& getDeviceBindings() - { - return mDeviceBindings; - } - - //! - //! \brief Returns a vector of device buffers. - //! - const std::vector& getDeviceBindings() const - { - return mDeviceBindings; - } - - //! - //! \brief Returns the device buffer corresponding to tensorName. - //! Returns nullptr if no such tensor can be found. - //! 
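-    //! The pointer remains owned by the BufferManager and stays valid until it is destroyed.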
- void* getDeviceBuffer(const std::string& tensorName) const - { - return getBuffer(false, tensorName); - } - - //! - //! \brief Returns the host buffer corresponding to tensorName. - //! Returns nullptr if no such tensor can be found. - //! - void* getHostBuffer(const std::string& tensorName) const - { - return getBuffer(true, tensorName); - } - - //! - //! \brief Returns the host buffer corresponding to tensorName. - //! Returns nullptr if no such tensor can be found. - //! - void* getHostBuffer(int bindingIndex) const - { - return getBuffer(true, bindingIndex); - } - - //! - //! \brief Returns the size of the host and device buffers that correspond to tensorName. - //! Returns kINVALID_SIZE_VALUE if no such tensor can be found. - //! - size_t size(const std::string& tensorName) const - { - int index = mEngine->getBindingIndex(tensorName.c_str()); - if (index == -1) - return kINVALID_SIZE_VALUE; - return mManagedBuffers[index]->hostBuffer.nbBytes(); - } - - //! - //! \brief Dump host buffer with specified tensorName to ostream. - //! Prints error message to std::ostream if no such tensor can be found. - //! - void dumpBuffer(std::ostream& os, const std::string& tensorName) - { - int index = mEngine->getBindingIndex(tensorName.c_str()); - if (index == -1) - { - os << "Invalid tensor name" << std::endl; - return; - } - void* buf = mManagedBuffers[index]->hostBuffer.data(); - size_t bufSize = mManagedBuffers[index]->hostBuffer.nbBytes(); - nvinfer1::Dims bufDims = mEngine->getBindingDimensions(index); - size_t rowCount = static_cast(bufDims.nbDims > 0 ? bufDims.d[bufDims.nbDims - 1] : mBatchSize); - int leadDim = mBatchSize; - int* trailDims = bufDims.d; - int nbDims = bufDims.nbDims; - - // Fix explicit Dimension networks - if (!leadDim && nbDims > 0) - { - leadDim = bufDims.d[0]; - ++trailDims; - --nbDims; - } - - os << "[" << leadDim; - for (int i = 0; i < nbDims; i++) - os << ", " << trailDims[i]; - os << "]" << std::endl; - switch (mEngine->getBindingDataType(index)) - { - case nvinfer1::DataType::kINT32: print(os, buf, bufSize, rowCount); break; - case nvinfer1::DataType::kFLOAT: print(os, buf, bufSize, rowCount); break; - case nvinfer1::DataType::kHALF: print(os, buf, bufSize, rowCount); break; - case nvinfer1::DataType::kINT8: assert(0 && "Int8 network-level input and output is not supported"); break; - case nvinfer1::DataType::kBOOL: assert(0 && "Bool network-level input and output are not supported"); break; - } - } - - //! - //! \brief Templated print function that dumps buffers of arbitrary type to std::ostream. - //! rowCount parameter controls how many elements are on each line. - //! A rowCount of 1 means that there is only 1 element on each line. - //! - template - void print(std::ostream& os, void* buf, size_t bufSize, size_t rowCount) - { - assert(rowCount != 0); - assert(bufSize % sizeof(T) == 0); - T* typedBuf = static_cast(buf); - size_t numItems = bufSize / sizeof(T); - for (int i = 0; i < static_cast(numItems); i++) - { - // Handle rowCount == 1 case - if (rowCount == 1 && i != static_cast(numItems) - 1) - os << typedBuf[i] << std::endl; - else if (rowCount == 1) - os << typedBuf[i]; - // Handle rowCount > 1 case - else if (i % rowCount == 0) - os << typedBuf[i]; - else if (i % rowCount == rowCount - 1) - os << " " << typedBuf[i] << std::endl; - else - os << " " << typedBuf[i]; - } - } - - //! - //! \brief Copy the contents of input host buffers to input device buffers synchronously. - //! 
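-    //! Blocks until the transfer finishes (plain cudaMemcpy, no stream involved).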
- void copyInputToDevice() - { - memcpyBuffers(true, false, false, 0); - } - - //! - //! \brief Copy the contents of output device buffers to output host buffers synchronously. - //! - void copyOutputToHost() - { - memcpyBuffers(false, true, false, 0); - } - - //! - //! \brief Copy the contents of input host buffers to input device buffers asynchronously. - //! - void copyInputToDeviceAsync(const cudaStream_t& stream) - { - memcpyBuffers(true, false, true, stream); - } - - //! - //! \brief Copy the contents of output device buffers to output host buffers asynchronously. - //! - void copyOutputToHostAsync(const cudaStream_t& stream) - { - memcpyBuffers(false, true, true, stream); - } - - ~BufferManager() = default; - -private: - void* getBuffer(const bool isHost, const std::string& tensorName) const - { - int index = mEngine->getBindingIndex(tensorName.c_str()); - if (index == -1) - return nullptr; - return (isHost ? mManagedBuffers[index]->hostBuffer.data() : mManagedBuffers[index]->deviceBuffer.data()); - } - - void* getBuffer(const bool isHost, int bindingIndex) const - { - if (bindingIndex == -1) - return nullptr; - return (isHost ? mManagedBuffers[bindingIndex]->hostBuffer.data() : mManagedBuffers[bindingIndex]->deviceBuffer.data()); - } - - void memcpyBuffers(const bool copyInput, const bool deviceToHost, const bool async, const cudaStream_t& stream) - { - for (int i = 0; i < mEngine->getNbBindings(); i++) - { - void* dstPtr = deviceToHost ? mManagedBuffers[i]->hostBuffer.data() : mManagedBuffers[i]->deviceBuffer.data(); - const void* srcPtr = deviceToHost ? mManagedBuffers[i]->deviceBuffer.data() : mManagedBuffers[i]->hostBuffer.data(); - const size_t byteSize = mManagedBuffers[i]->hostBuffer.nbBytes(); - const cudaMemcpyKind memcpyType = deviceToHost ? cudaMemcpyDeviceToHost : cudaMemcpyHostToDevice; - if ((copyInput && mEngine->bindingIsInput(i)) || (!copyInput && !mEngine->bindingIsInput(i))) - { - if (async) - CHECK(cudaMemcpyAsync(dstPtr, srcPtr, byteSize, memcpyType, stream)); - else - CHECK(cudaMemcpy(dstPtr, srcPtr, byteSize, memcpyType)); - } - } - } - - std::shared_ptr mEngine; //!< The pointer to the engine - int mBatchSize = 0; //!< The batch size for legacy networks, 0 otherwise. - std::vector> mManagedBuffers; //!< The vector of pointers to managed buffers - std::vector mDeviceBindings; //!< The vector of device buffers needed for engine execution -}; - -} // namespace samplesCommon - -#endif // TENSORRT_BUFFERS_H diff --git a/src/Detector/tensorrt_yolo/common_deprecated/common.h b/src/Detector/tensorrt_yolo/common_deprecated/common.h deleted file mode 100644 index 2270a2cd..00000000 --- a/src/Detector/tensorrt_yolo/common_deprecated/common.h +++ /dev/null @@ -1,963 +0,0 @@ -/* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-#ifndef TENSORRT_COMMON_H
-#define TENSORRT_COMMON_H
-
-// For loadLibrary
-#ifdef _MSC_VER
-// Needed so that the max/min definitions in windows.h do not conflict with std::max/min.
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#undef NOMINMAX
-#else
-#include <dlfcn.h>
-#endif
-
-#include "NvInfer.h"
-#include "NvInferPlugin.h"
-#include "logger.h"
-#include <algorithm>
-#include <cassert>
-#include <chrono>
-#include <cmath>
-#include <cstring>
-#include <cuda_runtime_api.h>
-#include <fstream>
-#include <iomanip>
-#include <iostream>
-#include <iterator>
-#include <map>
-#include <memory>
-#include <new>
-#include <numeric>
-#include <ratio>
-#include <sstream>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "safeCommon.h"
-
-#ifdef _MSC_VER
-#define FN_NAME __FUNCTION__
-#else
-#define FN_NAME __func__
-#endif
-
-#if defined(__aarch64__) || defined(__QNX__)
-#define ENABLE_DLA_API 1
-#endif
-
-#define CHECK_RETURN_W_MSG(status, val, errMsg)                                                                        \
-    do                                                                                                                 \
-    {                                                                                                                  \
-        if (!(status))                                                                                                 \
-        {                                                                                                              \
-            sample::gLogError << errMsg << " Error in " << __FILE__ << ", function " << FN_NAME << "(), line " << __LINE__ \
-                              << std::endl;                                                                            \
-            return val;                                                                                                \
-        }                                                                                                              \
-    } while (0)
-
-#undef ASSERT
-#define ASSERT(condition)                                                          \
-    do                                                                             \
-    {                                                                              \
-        if (!(condition))                                                          \
-        {                                                                          \
-            sample::gLogError << "Assertion failure: " << #condition << std::endl; \
-            abort();                                                               \
-        }                                                                          \
-    } while (0)
-
-#define CHECK_RETURN(status, val) CHECK_RETURN_W_MSG(status, val, "")
-
-#define OBJ_GUARD(A) std::unique_ptr<A, void (*)(A*)>
-
-template <typename T, typename T_>
-OBJ_GUARD(T)
-makeObjGuard(T_* t)
-{
-    CHECK(!(std::is_base_of<T, T_>::value || std::is_same<T, T_>::value));
-    auto deleter = [](T* t) { t->destroy(); };
-    return std::unique_ptr<T, decltype(deleter)>{static_cast<T*>(t), deleter};
-}
-
-constexpr long double operator"" _GiB(long double val)
-{
-    return val * (1 << 30);
-}
-constexpr long double operator"" _MiB(long double val)
-{
-    return val * (1 << 20);
-}
-constexpr long double operator"" _KiB(long double val)
-{
-    return val * (1 << 10);
-}
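To make the intent of these suffixes concrete, a small illustrative check (shown out of line; the integer overloads are declared just below, and setMaxWorkspaceSize is the deprecated-era TensorRT call where such literals were typically used):

    static_assert(1_GiB == (1LL << 30), "1 gibibyte");
    static_assert(16_MiB == 16LL * (1 << 20), "16 mebibytes");
    // Typical call site of that era: config->setMaxWorkspaceSize(1_GiB);

-// These are necessary if we want to be able to write 1_GiB instead of 1.0_GiB.
-// Since the return type is signed, -1_GiB will work as expected.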
-constexpr long long int operator"" _GiB(unsigned long long val) -{ - return val * (1 << 30); -} -constexpr long long int operator"" _MiB(unsigned long long val) -{ - return val * (1 << 20); -} -constexpr long long int operator"" _KiB(unsigned long long val) -{ - return val * (1 << 10); -} - -struct SimpleProfiler : public nvinfer1::IProfiler -{ - struct Record - { - float time{0}; - int count{0}; - }; - - virtual void reportLayerTime(const char* layerName, float ms) noexcept - { - mProfile[layerName].count++; - mProfile[layerName].time += ms; - if (std::find(mLayerNames.begin(), mLayerNames.end(), layerName) == mLayerNames.end()) - { - mLayerNames.push_back(layerName); - } - } - - SimpleProfiler(const char* name, const std::vector& srcProfilers = std::vector()) - : mName(name) - { - for (const auto& srcProfiler : srcProfilers) - { - for (const auto& rec : srcProfiler.mProfile) - { - auto it = mProfile.find(rec.first); - if (it == mProfile.end()) - { - mProfile.insert(rec); - } - else - { - it->second.time += rec.second.time; - it->second.count += rec.second.count; - } - } - } - } - - friend std::ostream& operator<<(std::ostream& out, const SimpleProfiler& value) - { - out << "========== " << value.mName << " profile ==========" << std::endl; - float totalTime = 0; - std::string layerNameStr = "TensorRT layer name"; - int maxLayerNameLength = std::max(static_cast(layerNameStr.size()), 70); - for (const auto& elem : value.mProfile) - { - totalTime += elem.second.time; - maxLayerNameLength = std::max(maxLayerNameLength, static_cast(elem.first.size())); - } - - auto old_settings = out.flags(); - auto old_precision = out.precision(); - // Output header - { - out << std::setw(maxLayerNameLength) << layerNameStr << " "; - out << std::setw(12) << "Runtime, " - << "%" - << " "; - out << std::setw(12) << "Invocations" - << " "; - out << std::setw(12) << "Runtime, ms" << std::endl; - } - for (size_t i = 0; i < value.mLayerNames.size(); i++) - { - const std::string layerName = value.mLayerNames[i]; - auto elem = value.mProfile.at(layerName); - out << std::setw(maxLayerNameLength) << layerName << " "; - out << std::setw(12) << std::fixed << std::setprecision(1) << (elem.time * 100.0F / totalTime) << "%" - << " "; - out << std::setw(12) << elem.count << " "; - out << std::setw(12) << std::fixed << std::setprecision(2) << elem.time << std::endl; - } - out.flags(old_settings); - out.precision(old_precision); - out << "========== " << value.mName << " total runtime = " << totalTime << " ms ==========" << std::endl; - - return out; - } - -private: - std::string mName; - std::vector mLayerNames; - std::map mProfile; -}; - -//! Locate path to file, given its filename or filepath suffix and possible dirs it might lie in. -//! Function will also walk back MAX_DEPTH dirs from CWD to check for such a file path. 
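-//! For example, locateFile("mnist.onnx", {"data/mnist/", "data/"}) tries data/mnist/mnist.onnx,
-//! then ../data/mnist/mnist.onnx, and so on, up to MAX_DEPTH (10) parent levels per directory.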
-inline std::string locateFile( - const std::string& filepathSuffix, const std::vector& directories, bool reportError = true) -{ - const int MAX_DEPTH{10}; - bool found{false}; - std::string filepath; - - for (auto& dir : directories) - { - if (!dir.empty() && dir.back() != '/') - { -#ifdef _MSC_VER - filepath = dir + "\\" + filepathSuffix; -#else - filepath = dir + "/" + filepathSuffix; -#endif - } - else - { - filepath = dir + filepathSuffix; - } - - for (int i = 0; i < MAX_DEPTH && !found; i++) - { - const std::ifstream checkFile(filepath); - found = checkFile.is_open(); - if (found) - { - break; - } - - filepath = "../" + filepath; // Try again in parent dir - } - - if (found) - { - break; - } - - filepath.clear(); - } - - // Could not find the file - if (filepath.empty()) - { - const std::string dirList = std::accumulate(directories.begin() + 1, directories.end(), directories.front(), - [](const std::string& a, const std::string& b) { return a + "\n\t" + b; }); - std::cout << "Could not find " << filepathSuffix << " in data directories:\n\t" << dirList << std::endl; - - if (reportError) - { - std::cout << "&&&& FAILED" << std::endl; - exit(EXIT_FAILURE); - } - } - - return filepath; -} - -inline void readPGMFile(const std::string& fileName, uint8_t* buffer, int inH, int inW) -{ - std::ifstream infile(fileName, std::ifstream::binary); - assert(infile.is_open() && "Attempting to read from a file that is not open."); - std::string magic, h, w, max; - infile >> magic >> h >> w >> max; - infile.seekg(1, infile.cur); - infile.read(reinterpret_cast(buffer), inH * inW); -} - -namespace samplesCommon -{ - -// Swaps endianness of an integral type. -template ::value, int>::type = 0> -inline T swapEndianness(const T& value) -{ - uint8_t bytes[sizeof(T)]; - for (int i = 0; i < static_cast(sizeof(T)); ++i) - { - bytes[sizeof(T) - 1 - i] = *(reinterpret_cast(&value) + i); - } - return *reinterpret_cast(bytes); -} - -class HostMemory -{ -public: - HostMemory() = delete; - virtual void* data() const noexcept - { - return mData; - } - virtual std::size_t size() const noexcept - { - return mSize; - } - virtual nvinfer1::DataType type() const noexcept - { - return mType; - } - virtual ~HostMemory() {} - -protected: - HostMemory(std::size_t size, nvinfer1::DataType type) - : mData{nullptr} - , mSize(size) - , mType(type) - { - } - void* mData; - std::size_t mSize; - nvinfer1::DataType mType; -}; - -template -class TypedHostMemory : public HostMemory -{ -public: - explicit TypedHostMemory(std::size_t size) - : HostMemory(size, dataType) - { - mData = new ElemType[size]; - }; - ~TypedHostMemory() noexcept - { - delete[](ElemType*) mData; - } - ElemType* raw() noexcept - { - return static_cast(data()); - } -}; - -using FloatMemory = TypedHostMemory; -using HalfMemory = TypedHostMemory; -using ByteMemory = TypedHostMemory; - -inline void* safeCudaMalloc(size_t memSize) -{ - void* deviceMem; - CHECK(cudaMalloc(&deviceMem, memSize)); - if (deviceMem == nullptr) - { - std::cerr << "Out of memory" << std::endl; - exit(1); - } - return deviceMem; -} - -inline bool isDebug() -{ - return (std::getenv("TENSORRT_DEBUG") ? 
true : false); -} - -struct InferDeleter -{ - template - void operator()(T* obj) const - { -#if (NV_TENSORRT_MAJOR < 8) - obj->destroy(); -#else - delete obj; -#endif - } -}; - -template -using SampleUniquePtr = std::unique_ptr; - -static auto StreamDeleter = [](cudaStream_t* pStream) - { - if (pStream) - { - cudaStreamDestroy(*pStream); - delete pStream; - } - }; - -inline std::unique_ptr makeCudaStream() -{ - std::unique_ptr pStream(new cudaStream_t, StreamDeleter); - if (cudaStreamCreateWithFlags(pStream.get(), cudaStreamNonBlocking) != cudaSuccess) - { - pStream.reset(nullptr); - } - - return pStream; -} - -//! Return vector of indices that puts magnitudes of sequence in descending order. -template -std::vector argMagnitudeSort(Iter begin, Iter end) -{ - std::vector indices(end - begin); - std::iota(indices.begin(), indices.end(), 0); - std::sort(indices.begin(), indices.end(), [&begin](size_t i, size_t j) { return std::abs(begin[j]) < std::abs(begin[i]); }); - return indices; -} - -inline bool readReferenceFile(const std::string& fileName, std::vector& refVector) -{ - std::ifstream infile(fileName); - if (!infile.is_open()) - { - std::cout << "ERROR: readReferenceFile: Attempting to read from a file that is not open." << std::endl; - return false; - } - std::string line; - while (std::getline(infile, line)) - { - if (line.empty()) - continue; - refVector.push_back(line); - } - infile.close(); - return true; -} - -template -std::vector classify( - const std::vector& refVector, const std::vector& output, const size_t topK) -{ - const auto inds = samplesCommon::argMagnitudeSort(output.cbegin(), output.cend()); - std::vector result; - result.reserve(topK); - for (size_t k = 0; k < topK; ++k) - { - result.push_back(refVector[inds[k]]); - } - return result; -} - -// Returns indices of highest K magnitudes in v. -template -std::vector topKMagnitudes(const std::vector& v, const size_t k) -{ - std::vector indices = samplesCommon::argMagnitudeSort(v.cbegin(), v.cend()); - indices.resize(k); - return indices; -} - -template -bool readASCIIFile(const std::string& fileName, const size_t size, std::vector& out) -{ - std::ifstream infile(fileName); - if (!infile.is_open()) - { - std::cout << "ERROR readASCIIFile: Attempting to read from a file that is not open." << std::endl; - return false; - } - out.clear(); - out.reserve(size); - out.assign(std::istream_iterator(infile), std::istream_iterator()); - infile.close(); - return true; -} - -template -bool writeASCIIFile(const std::string& fileName, const std::vector& in) -{ - std::ofstream outfile(fileName); - if (!outfile.is_open()) - { - std::cout << "ERROR: writeASCIIFile: Attempting to write to a file that is not open." << std::endl; - return false; - } - for (auto fn : in) - { - outfile << fn << "\n"; - } - outfile.close(); - return true; -} - -inline void print_version() -{ - std::cout << " TensorRT version: " << NV_TENSORRT_MAJOR << "." << NV_TENSORRT_MINOR << "." << NV_TENSORRT_PATCH - << "." 
<< NV_TENSORRT_BUILD << std::endl; -} - -inline std::string getFileType(const std::string& filepath) -{ - return filepath.substr(filepath.find_last_of(".") + 1); -} - -inline std::string toLower(const std::string& inp) -{ - std::string out = inp; - std::transform(out.begin(), out.end(), out.begin(), ::tolower); - return out; -} - -inline float getMaxValue(const float* buffer, int64_t size) -{ - assert(buffer != nullptr); - assert(size > 0); - return *std::max_element(buffer, buffer + size); -} - -// Ensures that every tensor used by a network has a dynamic range set. -// -// All tensors in a network must have a dynamic range specified if a calibrator is not used. -// This function is just a utility to globally fill in missing scales and zero-points for the entire network. -// -// If a tensor does not have a dyanamic range set, it is assigned inRange or outRange as follows: -// -// * If the tensor is the input to a layer or output of a pooling node, its dynamic range is derived from inRange. -// * Otherwise its dynamic range is derived from outRange. -// -// The default parameter values are intended to demonstrate, for final layers in the network, -// cases where dynamic ranges are asymmetric. -// -// The default parameter values choosen arbitrarily. Range values should be choosen such that -// we avoid underflow or overflow. Also range value should be non zero to avoid uniform zero scale tensor. -inline void setAllDynamicRanges(nvinfer1::INetworkDefinition* network, float inRange = 2.0f, float outRange = 4.0f) -{ - // Ensure that all layer inputs have a scale. - for (int i = 0; i < network->getNbLayers(); i++) - { - auto layer = network->getLayer(i); - for (int j = 0; j < layer->getNbInputs(); j++) - { - nvinfer1::ITensor* input{layer->getInput(j)}; - // Optional inputs are nullptr here and are from RNN layers. - if (input != nullptr && !input->dynamicRangeIsSet()) - { - ASSERT(input->setDynamicRange(-inRange, inRange)); - } - } - } - - // Ensure that all layer outputs have a scale. - // Tensors that are also inputs to layers are ingored here - // since the previous loop nest assigned scales to them. - for (int i = 0; i < network->getNbLayers(); i++) - { - auto layer = network->getLayer(i); - for (int j = 0; j < layer->getNbOutputs(); j++) - { - nvinfer1::ITensor* output{layer->getOutput(j)}; - // Optional outputs are nullptr here and are from RNN layers. - if (output != nullptr && !output->dynamicRangeIsSet()) - { - // Pooling must have the same input and output scales. - if (layer->getType() == nvinfer1::LayerType::kPOOLING) - { - ASSERT(output->setDynamicRange(-inRange, inRange)); - } - else - { - ASSERT(output->setDynamicRange(-outRange, outRange)); - } - } - } - } -} - -inline void setDummyInt8DynamicRanges(const nvinfer1::IBuilderConfig* c, nvinfer1::INetworkDefinition* n) -{ - // Set dummy per-tensor dynamic range if Int8 mode is requested. - if (c->getFlag(nvinfer1::BuilderFlag::kINT8)) - { - sample::gLogWarning - << "Int8 calibrator not provided. Generating dummy per-tensor dynamic range. Int8 accuracy is not guaranteed." 
- << std::endl; - setAllDynamicRanges(n); - } -} - -inline void enableDLA(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, int useDLACore, bool allowGPUFallback = true) -{ - if (useDLACore >= 0) - { - if (builder->getNbDLACores() == 0) - { - std::cerr << "Trying to use DLA core " << useDLACore << " on a platform that doesn't have any DLA cores" - << std::endl; - assert("Error: use DLA core on a platfrom that doesn't have any DLA cores" && false); - } - if (allowGPUFallback) - { - config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); - } - if (!config->getFlag(nvinfer1::BuilderFlag::kINT8)) - { - // User has not requested INT8 Mode. - // By default run in FP16 mode. FP32 mode is not permitted. - config->setFlag(nvinfer1::BuilderFlag::kFP16); - } - config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA); - config->setDLACore(useDLACore); - } -} - -inline int32_t parseDLA(int32_t argc, char** argv) -{ - for (int32_t i = 1; i < argc; i++) - { - if (strncmp(argv[i], "--useDLACore=", 13) == 0) - { - return std::stoi(argv[i] + 13); - } - } - return -1; -} - -inline uint32_t getElementSize(nvinfer1::DataType t) noexcept -{ - switch (t) - { - case nvinfer1::DataType::kINT32: return 4; - case nvinfer1::DataType::kFLOAT: return 4; - case nvinfer1::DataType::kHALF: return 2; - case nvinfer1::DataType::kBOOL: - case nvinfer1::DataType::kINT8: return 1; - } - return 0; -} - -inline int64_t volume(const nvinfer1::Dims& d) -{ - return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); -} - -template -struct PPM -{ - std::string magic, fileName; - int h, w, max; - uint8_t buffer[C * H * W]; -}; - -// New vPPM(variable sized PPM) class with variable dimensions. -struct vPPM -{ - std::string magic, fileName; - int h, w, max; - std::vector buffer; -}; - -struct BBox -{ - float x1, y1, x2, y2; -}; - -template -void readPPMFile(const std::string& filename, samplesCommon::PPM& ppm) -{ - ppm.fileName = filename; - std::ifstream infile(filename, std::ifstream::binary); - assert(infile.is_open() && "Attempting to read from a file that is not open."); - infile >> ppm.magic >> ppm.w >> ppm.h >> ppm.max; - infile.seekg(1, infile.cur); - infile.read(reinterpret_cast(ppm.buffer), ppm.w * ppm.h * 3); -} - -inline void readPPMFile(const std::string& filename, vPPM& ppm, std::vector& input_dir) -{ - ppm.fileName = filename; - std::ifstream infile(locateFile(filename, input_dir), std::ifstream::binary); - infile >> ppm.magic >> ppm.w >> ppm.h >> ppm.max; - infile.seekg(1, infile.cur); - - for (int i = 0; i < ppm.w * ppm.h * 3; ++i) - { - ppm.buffer.push_back(0); - } - - infile.read(reinterpret_cast(&ppm.buffer[0]), ppm.w * ppm.h * 3); -} - -template -void writePPMFileWithBBox(const std::string& filename, PPM& ppm, const BBox& bbox) -{ - std::ofstream outfile("./" + filename, std::ofstream::binary); - assert(!outfile.fail()); - outfile << "P6" - << "\n" - << ppm.w << " " << ppm.h << "\n" - << ppm.max << "\n"; - - auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); }; - const int x1 = std::min(std::max(0, round(int(bbox.x1))), W - 1); - const int x2 = std::min(std::max(0, round(int(bbox.x2))), W - 1); - const int y1 = std::min(std::max(0, round(int(bbox.y1))), H - 1); - const int y2 = std::min(std::max(0, round(int(bbox.y2))), H - 1); - - for (int x = x1; x <= x2; ++x) - { - // bbox top border - ppm.buffer[(y1 * ppm.w + x) * 3] = 255; - ppm.buffer[(y1 * ppm.w + x) * 3 + 1] = 0; - ppm.buffer[(y1 * ppm.w + x) * 3 + 2] = 0; - // bbox bottom border - ppm.buffer[(y2 * ppm.w + x) * 3] = 255; 
- ppm.buffer[(y2 * ppm.w + x) * 3 + 1] = 0; - ppm.buffer[(y2 * ppm.w + x) * 3 + 2] = 0; - } - - for (int y = y1; y <= y2; ++y) - { - // bbox left border - ppm.buffer[(y * ppm.w + x1) * 3] = 255; - ppm.buffer[(y * ppm.w + x1) * 3 + 1] = 0; - ppm.buffer[(y * ppm.w + x1) * 3 + 2] = 0; - // bbox right border - ppm.buffer[(y * ppm.w + x2) * 3] = 255; - ppm.buffer[(y * ppm.w + x2) * 3 + 1] = 0; - ppm.buffer[(y * ppm.w + x2) * 3 + 2] = 0; - } - - outfile.write(reinterpret_cast(ppm.buffer), ppm.w * ppm.h * 3); -} - -inline void writePPMFileWithBBox(const std::string& filename, vPPM ppm, std::vector& dets) -{ - std::ofstream outfile("./" + filename, std::ofstream::binary); - assert(!outfile.fail()); - outfile << "P6" - << "\n" - << ppm.w << " " << ppm.h << "\n" - << ppm.max << "\n"; - auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); }; - - for (auto bbox : dets) - { - for (int x = int(bbox.x1); x < int(bbox.x2); ++x) - { - // bbox top border - ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3] = 255; - ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3 + 1] = 0; - ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3 + 2] = 0; - // bbox bottom border - ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3] = 255; - ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3 + 1] = 0; - ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3 + 2] = 0; - } - - for (int y = int(bbox.y1); y < int(bbox.y2); ++y) - { - // bbox left border - ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3] = 255; - ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3 + 1] = 0; - ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3 + 2] = 0; - // bbox right border - ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3] = 255; - ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3 + 1] = 0; - ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3 + 2] = 0; - } - } - - outfile.write(reinterpret_cast(&ppm.buffer[0]), ppm.w * ppm.h * 3); -} - -class TimerBase -{ -public: - virtual void start() {} - virtual void stop() {} - float microseconds() const noexcept - { - return mMs * 1000.f; - } - float milliseconds() const noexcept - { - return mMs; - } - float seconds() const noexcept - { - return mMs / 1000.f; - } - void reset() noexcept - { - mMs = 0.f; - } - -protected: - float mMs{0.0f}; -}; - -class GpuTimer : public TimerBase -{ -public: - explicit GpuTimer(cudaStream_t stream) - : mStream(stream) - { - CHECK(cudaEventCreate(&mStart)); - CHECK(cudaEventCreate(&mStop)); - } - ~GpuTimer() - { - CHECK(cudaEventDestroy(mStart)); - CHECK(cudaEventDestroy(mStop)); - } - void start() - { - CHECK(cudaEventRecord(mStart, mStream)); - } - void stop() - { - CHECK(cudaEventRecord(mStop, mStream)); - float ms{0.0f}; - CHECK(cudaEventSynchronize(mStop)); - CHECK(cudaEventElapsedTime(&ms, mStart, mStop)); - mMs += ms; - } - -private: - cudaEvent_t mStart, mStop; - cudaStream_t mStream; -}; // class GpuTimer - -template -class CpuTimer : public TimerBase -{ -public: - using clock_type = Clock; - - void start() - { - mStart = Clock::now(); - } - void stop() - { - mStop = Clock::now(); - mMs += std::chrono::duration{mStop - mStart}.count(); - } - -private: - std::chrono::time_point mStart, mStop; -}; // class CpuTimer - -using PreciseCpuTimer = CpuTimer; - -inline std::vector splitString(std::string str, char delimiter = ',') -{ - std::vector splitVect; - std::stringstream ss(str); - std::string substr; - - while (ss.good()) - { - getline(ss, substr, delimiter); - splitVect.emplace_back(std::move(substr)); - } - return splitVect; -} - -// Return m rounded up to nearest multiple of n -inline int roundUp(int m, int n) -{ - return ((m + 
n - 1) / n) * n; -} - -inline int getC(const nvinfer1::Dims& d) -{ - return d.nbDims >= 3 ? d.d[d.nbDims - 3] : 1; -} - -inline int getH(const nvinfer1::Dims& d) -{ - return d.nbDims >= 2 ? d.d[d.nbDims - 2] : 1; -} - -inline int getW(const nvinfer1::Dims& d) -{ - return d.nbDims >= 1 ? d.d[d.nbDims - 1] : 1; -} - -inline void loadLibrary(const std::string& path) -{ -#ifdef _MSC_VER - void* handle = LoadLibrary(path.c_str()); -#else - int32_t flags{RTLD_LAZY}; -#if ENABLE_ASAN - // https://github.com/google/sanitizers/issues/89 - // asan doesn't handle module unloading correctly and there are no plans on doing - // so. In order to get proper stack traces, don't delete the shared library on - // close so that asan can resolve the symbols correctly. - flags |= RTLD_NODELETE; -#endif // ENABLE_ASAN - - void* handle = dlopen(path.c_str(), flags); -#endif - if (handle == nullptr) - { -#ifdef _MSC_VER - sample::gLogError << "Could not load plugin library: " << path << std::endl; -#else - sample::gLogError << "Could not load plugin library: " << path << ", due to: " << dlerror() << std::endl; -#endif - } -} - -inline int32_t getSMVersion() -{ - int32_t deviceIndex = 0; - CHECK(cudaGetDevice(&deviceIndex)); - - int32_t major, minor; - CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, deviceIndex)); - CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, deviceIndex)); - - return ((major << 8) | minor); -} - -inline bool isSMSafe() -{ - const int32_t smVersion = getSMVersion(); - return smVersion == 0x0700 || smVersion == 0x0702 || smVersion == 0x0705 || - smVersion == 0x0800 || smVersion == 0x0806 || smVersion == 0x0807; -} - -inline bool isDataTypeSupported(nvinfer1::DataType dataType) -{ - auto builder = SampleUniquePtr(nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger())); - if (!builder) - { - return false; - } - - if ((dataType == nvinfer1::DataType::kINT8 && !builder->platformHasFastInt8()) - || (dataType == nvinfer1::DataType::kHALF && !builder->platformHasFastFp16())) - { - return false; - } - - return true; -} - -} // namespace samplesCommon - -inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) -{ - os << "("; - for (int i = 0; i < dims.nbDims; ++i) - { - os << (i ? ", " : "") << dims.d[i]; - } - return os << ")"; -} - -#endif // TENSORRT_COMMON_H diff --git a/src/Detector/tensorrt_yolo/common_deprecated/half.h b/src/Detector/tensorrt_yolo/common_deprecated/half.h deleted file mode 100644 index 0755c316..00000000 --- a/src/Detector/tensorrt_yolo/common_deprecated/half.h +++ /dev/null @@ -1,4302 +0,0 @@ -// half - IEEE 754-based half-precision floating point library. -// -// Copyright (c) 2012-2017 Christian Rau -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated -// documentation files (the "Software"), to deal in the Software without restriction, including without limitation the -// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to -// permit persons to whom the Software is furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the -// Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR -// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -/* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Version 1.12.0 - -/// \file -/// Main header file for half precision functionality. - -#ifndef HALF_HALF_HPP -#define HALF_HALF_HPP - -/// Combined gcc version number. -#define HALF_GNUC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) - -// check C++11 language features -#if defined(__clang__) // clang -#if __has_feature(cxx_static_assert) && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) -#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 -#endif -#if __has_feature(cxx_constexpr) && !defined(HALF_ENABLE_CPP11_CONSTEXPR) -#define HALF_ENABLE_CPP11_CONSTEXPR 1 -#endif -#if __has_feature(cxx_noexcept) && !defined(HALF_ENABLE_CPP11_NOEXCEPT) -#define HALF_ENABLE_CPP11_NOEXCEPT 1 -#endif -#if __has_feature(cxx_user_literals) && !defined(HALF_ENABLE_CPP11_USER_LITERALS) -#define HALF_ENABLE_CPP11_USER_LITERALS 1 -#endif -#if (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L) && !defined(HALF_ENABLE_CPP11_LONG_LONG) -#define HALF_ENABLE_CPP11_LONG_LONG 1 -#endif -/*#elif defined(__INTEL_COMPILER) //Intel C++ - #if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) ???????? - #define HALF_ENABLE_CPP11_STATIC_ASSERT 1 - #endif - #if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) ???????? - #define HALF_ENABLE_CPP11_CONSTEXPR 1 - #endif - #if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) ???????? - #define HALF_ENABLE_CPP11_NOEXCEPT 1 - #endif - #if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_LONG_LONG) ???????? 
- #define HALF_ENABLE_CPP11_LONG_LONG 1 - #endif*/ -#elif defined(__GNUC__) // gcc -#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L -#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) -#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 -#endif -#if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) -#define HALF_ENABLE_CPP11_CONSTEXPR 1 -#endif -#if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) -#define HALF_ENABLE_CPP11_NOEXCEPT 1 -#endif -#if HALF_GNUC_VERSION >= 407 && !defined(HALF_ENABLE_CPP11_USER_LITERALS) -#define HALF_ENABLE_CPP11_USER_LITERALS 1 -#endif -#if !defined(HALF_ENABLE_CPP11_LONG_LONG) -#define HALF_ENABLE_CPP11_LONG_LONG 1 -#endif -#endif -#elif defined(_MSC_VER) // Visual C++ -#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) -#define HALF_ENABLE_CPP11_CONSTEXPR 1 -#endif -#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) -#define HALF_ENABLE_CPP11_NOEXCEPT 1 -#endif -#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_USER_LITERALS) -#define HALF_ENABLE_CPP11_USER_LITERALS 1 -#endif -#if _MSC_VER >= 1600 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) -#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 -#endif -#if _MSC_VER >= 1310 && !defined(HALF_ENABLE_CPP11_LONG_LONG) -#define HALF_ENABLE_CPP11_LONG_LONG 1 -#endif -#define HALF_POP_WARNINGS 1 -#pragma warning(push) -#pragma warning(disable : 4099 4127 4146) // struct vs class, constant in if, negative unsigned -#endif - -// check C++11 library features -#include -#if defined(_LIBCPP_VERSION) // libc++ -#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103 -#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS -#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 -#endif -#ifndef HALF_ENABLE_CPP11_CSTDINT -#define HALF_ENABLE_CPP11_CSTDINT 1 -#endif -#ifndef HALF_ENABLE_CPP11_CMATH -#define HALF_ENABLE_CPP11_CMATH 1 -#endif -#ifndef HALF_ENABLE_CPP11_HASH -#define HALF_ENABLE_CPP11_HASH 1 -#endif -#endif -#elif defined(__GLIBCXX__) // libstdc++ -#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103 -#ifdef __clang__ -#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS) -#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 -#endif -#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CSTDINT) -#define HALF_ENABLE_CPP11_CSTDINT 1 -#endif -#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CMATH) -#define HALF_ENABLE_CPP11_CMATH 1 -#endif -#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_HASH) -#define HALF_ENABLE_CPP11_HASH 1 -#endif -#else -#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CSTDINT) -#define HALF_ENABLE_CPP11_CSTDINT 1 -#endif -#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CMATH) -#define HALF_ENABLE_CPP11_CMATH 1 -#endif -#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_HASH) -#define HALF_ENABLE_CPP11_HASH 1 -#endif -#endif -#endif -#elif defined(_CPPLIB_VER) // Dinkumware/Visual C++ -#if _CPPLIB_VER >= 520 -#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS -#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 -#endif -#ifndef HALF_ENABLE_CPP11_CSTDINT -#define HALF_ENABLE_CPP11_CSTDINT 1 -#endif -#ifndef HALF_ENABLE_CPP11_HASH -#define HALF_ENABLE_CPP11_HASH 1 -#endif -#endif -#if _CPPLIB_VER >= 610 -#ifndef HALF_ENABLE_CPP11_CMATH -#define HALF_ENABLE_CPP11_CMATH 1 -#endif -#endif -#endif -#undef HALF_GNUC_VERSION - -// support constexpr -#if HALF_ENABLE_CPP11_CONSTEXPR -#define HALF_CONSTEXPR constexpr -#define HALF_CONSTEXPR_CONST constexpr -#else -#define HALF_CONSTEXPR -#define 
HALF_CONSTEXPR_CONST const
-#endif
-
-// support noexcept
-#if HALF_ENABLE_CPP11_NOEXCEPT
-#define HALF_NOEXCEPT noexcept
-#define HALF_NOTHROW noexcept
-#else
-#define HALF_NOEXCEPT
-#define HALF_NOTHROW throw()
-#endif
-
-#include <algorithm>
-#include <iostream>
-#include <limits>
-#include <climits>
-#include <cmath>
-#include <cstring>
-#if HALF_ENABLE_CPP11_TYPE_TRAITS
-#include <type_traits>
-#endif
-#if HALF_ENABLE_CPP11_CSTDINT
-#include <cstdint>
-#endif
-#if HALF_ENABLE_CPP11_HASH
-#include <functional>
-#endif
-
-/// Default rounding mode.
-/// This specifies the rounding mode used for all conversions between [half](\ref half_float::half)s and `float`s as
-/// well as for the half_cast() if not specifying a rounding mode explicitly. It can be redefined (before including
-/// half.hpp) to one of the standard rounding modes using their respective constants or the equivalent values of
-/// `std::float_round_style`:
-///
-/// `std::float_round_style`         | value | rounding
-/// ---------------------------------|-------|-------------------------
-/// `std::round_indeterminate`       | -1    | fastest (default)
-/// `std::round_toward_zero`         | 0     | toward zero
-/// `std::round_to_nearest`          | 1     | to nearest
-/// `std::round_toward_infinity`     | 2     | toward positive infinity
-/// `std::round_toward_neg_infinity` | 3     | toward negative infinity
-///
-/// By default this is set to `-1` (`std::round_indeterminate`), which uses truncation (round toward zero, but with
-/// overflows set to infinity) and is the fastest rounding mode possible. It can even be set to
-/// `std::numeric_limits<float>::round_style` to synchronize the rounding mode with that of the underlying
-/// single-precision implementation.
-#ifndef HALF_ROUND_STYLE
-#define HALF_ROUND_STYLE 1 // = std::round_to_nearest
-#endif
-
-/// Tie-breaking behaviour for round to nearest.
-/// This specifies if ties in round to nearest should be resolved by rounding to the nearest even value. By default this
-/// is defined to `0` resulting in the faster but slightly more biased behaviour of rounding away from zero in half-way
-/// cases (and thus equal to the round() function), but can be redefined to `1` (before including half.hpp) if more
-/// IEEE-conformant behaviour is needed.
-#ifndef HALF_ROUND_TIES_TO_EVEN
-#define HALF_ROUND_TIES_TO_EVEN 0 // ties away from zero
-#endif
-
-/// Value signaling overflow.
-/// In correspondence with `HUGE_VAL[F|L]` from `<cmath>` this symbol expands to a positive value signaling the overflow
-/// of an operation, in particular it just evaluates to positive infinity.
-#define HUGE_VALH std::numeric_limits<half_float::half>::infinity()
-
-/// Fast half-precision fma function.
-/// This symbol is only defined if the fma() function generally executes as fast as, or faster than, a separate
-/// half-precision multiplication followed by an addition. Due to the internal single-precision implementation of all
-/// arithmetic operations, this is in fact always the case.
-#define FP_FAST_FMAH 1
-
-#ifndef FP_ILOGB0
-#define FP_ILOGB0 INT_MIN
-#endif
-#ifndef FP_ILOGBNAN
-#define FP_ILOGBNAN INT_MAX
-#endif
-#ifndef FP_SUBNORMAL
-#define FP_SUBNORMAL 0
-#endif
-#ifndef FP_ZERO
-#define FP_ZERO 1
-#endif
-#ifndef FP_NAN
-#define FP_NAN 2
-#endif
-#ifndef FP_INFINITE
-#define FP_INFINITE 3
-#endif
-#ifndef FP_NORMAL
-#define FP_NORMAL 4
-#endif
-
-/// Main namespace for half precision functionality.
-/// This namespace contains all the functionality provided by the library.
-namespace half_float
-{
-class half;
-
-#if HALF_ENABLE_CPP11_USER_LITERALS
-/// Library-defined half-precision literals.
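Since HALF_ROUND_STYLE and HALF_ROUND_TIES_TO_EVEN only take effect when defined before the header is included, a consumer wanting fully IEEE-conformant round-to-nearest-even would set them up as below. A minimal sketch, assuming the in-tree include path:

    // Must precede the include; otherwise the header's #ifndef defaults apply.
    #define HALF_ROUND_STYLE 1        // std::round_to_nearest
    #define HALF_ROUND_TIES_TO_EVEN 1 // resolve ties to even, IEEE-style
    #include "common_deprecated/half.h"

    #include <iostream>

    int main()
    {
        half_float::half h(0.1f); // rounded to the nearest representable half
        std::cout << static_cast<float>(h) << std::endl; // ~0.0999756
        return 0;
    }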
-/// Import this namespace to enable half-precision floating point literals: -/// ~~~~{.cpp} -/// using namespace half_float::literal; -/// half_float::half = 4.2_h; -/// ~~~~ -namespace literal -{ -half operator"" _h(long double); -} -#endif - -/// \internal -/// \brief Implementation details. -namespace detail -{ -#if HALF_ENABLE_CPP11_TYPE_TRAITS -/// Conditional type. -template -struct conditional : std::conditional -{ -}; - -/// Helper for tag dispatching. -template -struct bool_type : std::integral_constant -{ -}; -using std::false_type; -using std::true_type; - -/// Type traits for floating point types. -template -struct is_float : std::is_floating_point -{ -}; -#else -/// Conditional type. -template -struct conditional -{ - typedef T type; -}; -template -struct conditional -{ - typedef F type; -}; - -/// Helper for tag dispatching. -template -struct bool_type -{ -}; -typedef bool_type true_type; -typedef bool_type false_type; - -/// Type traits for floating point types. -template -struct is_float : false_type -{ -}; -template -struct is_float : is_float -{ -}; -template -struct is_float : is_float -{ -}; -template -struct is_float : is_float -{ -}; -template <> -struct is_float : true_type -{ -}; -template <> -struct is_float : true_type -{ -}; -template <> -struct is_float : true_type -{ -}; -#endif - -/// Type traits for floating point bits. -template -struct bits -{ - typedef unsigned char type; -}; -template -struct bits : bits -{ -}; -template -struct bits : bits -{ -}; -template -struct bits : bits -{ -}; - -#if HALF_ENABLE_CPP11_CSTDINT -/// Unsigned integer of (at least) 16 bits width. -typedef std::uint_least16_t uint16; - -/// Unsigned integer of (at least) 32 bits width. -template <> -struct bits -{ - typedef std::uint_least32_t type; -}; - -/// Unsigned integer of (at least) 64 bits width. -template <> -struct bits -{ - typedef std::uint_least64_t type; -}; -#else -/// Unsigned integer of (at least) 16 bits width. -typedef unsigned short uint16; - -/// Unsigned integer of (at least) 32 bits width. -template <> -struct bits : conditional::digits >= 32, unsigned int, unsigned long> -{ -}; - -#if HALF_ENABLE_CPP11_LONG_LONG -/// Unsigned integer of (at least) 64 bits width. -template <> -struct bits : conditional::digits >= 64, unsigned long, unsigned long long> -{ -}; -#else -/// Unsigned integer of (at least) 64 bits width. -template <> -struct bits -{ - typedef unsigned long type; -}; -#endif -#endif - -/// Tag type for binary construction. -struct binary_t -{ -}; - -/// Tag for binary construction. -HALF_CONSTEXPR_CONST binary_t binary = binary_t(); - -/// Temporary half-precision expression. -/// This class represents a half-precision expression which just stores a single-precision value internally. -struct expr -{ - /// Conversion constructor. - /// \param f single-precision value to convert - explicit HALF_CONSTEXPR expr(float f) HALF_NOEXCEPT : value_(f) {} - - /// Conversion to single-precision. - /// \return single precision value representing expression value - HALF_CONSTEXPR operator float() const HALF_NOEXCEPT - { - return value_; - } - -private: - /// Internal expression value stored in single-precision. - float value_; -}; - -/// SFINAE helper for generic half-precision functions. -/// This class template has to be specialized for each valid combination of argument types to provide a corresponding -/// `type` member equivalent to \a T. 
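The expr wrapper above is what lets chained arithmetic stay in single precision until a result is finally assigned back to a half, as its comments describe. A small behavioral sketch, assuming this header is includable on its own:

    #include "common_deprecated/half.h"
    #include <iostream>

    int main()
    {
        using half_float::half;
        half a(1.0f), b(3.0f);
        // The division is evaluated in float; only the assignment rounds to half.
        half c = a / b;
        std::cout << static_cast<float>(c) << std::endl; // ~0.333252
        return 0;
    }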
-/// \tparam T type to return -template -struct enable -{ -}; -template -struct enable -{ - typedef T type; -}; -template -struct enable -{ - typedef T type; -}; -template -struct enable -{ - typedef T type; -}; -template -struct enable -{ - typedef T type; -}; -template -struct enable -{ - typedef T type; -}; -template -struct enable -{ - typedef T type; -}; -template -struct enable -{ - typedef T type; -}; -template -struct enable -{ - typedef T type; -}; -template -struct enable -{ - typedef T type; -}; -template -struct enable -{ - typedef T type; -}; -template -struct enable -{ - typedef T type; -}; -template -struct enable -{ - typedef T type; -}; -template -struct enable -{ - typedef T type; -}; -template -struct enable -{ - typedef T type; -}; - -/// Return type for specialized generic 2-argument half-precision functions. -/// This class template has to be specialized for each valid combination of argument types to provide a corresponding -/// `type` member denoting the appropriate return type. -/// \tparam T first argument type -/// \tparam U first argument type -template -struct result : enable -{ -}; -template <> -struct result -{ - typedef half type; -}; - -/// \name Classification helpers -/// \{ - -/// Check for infinity. -/// \tparam T argument type (builtin floating point type) -/// \param arg value to query -/// \retval true if infinity -/// \retval false else -template -bool builtin_isinf(T arg) -{ -#if HALF_ENABLE_CPP11_CMATH - return std::isinf(arg); -#elif defined(_MSC_VER) - return !::_finite(static_cast(arg)) && !::_isnan(static_cast(arg)); -#else - return arg == std::numeric_limits::infinity() || arg == -std::numeric_limits::infinity(); -#endif -} - -/// Check for NaN. -/// \tparam T argument type (builtin floating point type) -/// \param arg value to query -/// \retval true if not a number -/// \retval false else -template -bool builtin_isnan(T arg) -{ -#if HALF_ENABLE_CPP11_CMATH - return std::isnan(arg); -#elif defined(_MSC_VER) - return ::_isnan(static_cast(arg)) != 0; -#else - return arg != arg; -#endif -} - -/// Check sign. -/// \tparam T argument type (builtin floating point type) -/// \param arg value to query -/// \retval true if signbit set -/// \retval false else -template -bool builtin_signbit(T arg) -{ -#if HALF_ENABLE_CPP11_CMATH - return std::signbit(arg); -#else - return arg < T() || (arg == T() && T(1) / arg < T()); -#endif -} - -/// \} -/// \name Conversion -/// \{ - -/// Convert IEEE single-precision to half-precision. -/// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf). -/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding -/// \param value single-precision value -/// \return binary representation of half-precision value -template -uint16 float2half_impl(float value, true_type) -{ - typedef bits::type uint32; - uint32 bits; // = *reinterpret_cast(&value); //violating strict aliasing! 
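The memcpy on the next diff line is the well-defined way to read out a float's bit pattern; the commented-out reinterpret_cast alternative violates strict aliasing, as the inline comment notes. The same idiom as a standalone sketch (C++20 code could use std::bit_cast instead):

    #include <cstdint>
    #include <cstring>

    std::uint32_t float_bits(float value)
    {
        std::uint32_t bits;
        std::memcpy(&bits, &value, sizeof bits); // defined behavior, unlike a pointer cast
        return bits;
    }
    // float_bits(1.0f) == 0x3F800000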
- std::memcpy(&bits, &value, sizeof(float)); - /* uint16 hbits = (bits>>16) & 0x8000; - bits &= 0x7FFFFFFF; - int exp = bits >> 23; - if(exp == 255) - return hbits | 0x7C00 | (0x3FF&-static_cast((bits&0x7FFFFF)!=0)); - if(exp > 142) - { - if(R == std::round_toward_infinity) - return hbits | 0x7C00 - (hbits>>15); - if(R == std::round_toward_neg_infinity) - return hbits | 0x7BFF + (hbits>>15); - return hbits | 0x7BFF + (R!=std::round_toward_zero); - } - int g, s; - if(exp > 112) - { - g = (bits>>12) & 1; - s = (bits&0xFFF) != 0; - hbits |= ((exp-112)<<10) | ((bits>>13)&0x3FF); - } - else if(exp > 101) - { - int i = 125 - exp; - bits = (bits&0x7FFFFF) | 0x800000; - g = (bits>>i) & 1; - s = (bits&((1L<> (i+1); - } - else - { - g = 0; - s = bits != 0; - } - if(R == std::round_to_nearest) - #if HALF_ROUND_TIES_TO_EVEN - hbits += g & (s|hbits); - #else - hbits += g; - #endif - else if(R == std::round_toward_infinity) - hbits += ~(hbits>>15) & (s|g); - else if(R == std::round_toward_neg_infinity) - hbits += (hbits>>15) & (g|s); - */ - static const uint16 base_table[512] = {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, - 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, - 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, 0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, - 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, - 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, 0xC000, 0xC400, 0xC800, - 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00}; - static const unsigned char shift_table[512] = {24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, - 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, - 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13}; - uint16 hbits = base_table[bits >> 23] + static_cast((bits & 0x7FFFFF) >> shift_table[bits >> 23]); - if (R == std::round_to_nearest) - hbits += (((bits & 0x7FFFFF) >> (shift_table[bits >> 23] - 1)) | (((bits >> 23) & 0xFF) == 102)) - & ((hbits & 0x7C00) != 0x7C00) -#if HALF_ROUND_TIES_TO_EVEN - & (((((static_cast(1) << (shift_table[bits >> 23] - 1)) - 1) & bits) != 0) | hbits) -#endif - ; - else if (R == std::round_toward_zero) - hbits -= ((hbits & 0x7FFF) == 0x7C00) & ~shift_table[bits >> 23]; - else if (R == std::round_toward_infinity) - hbits += ((((bits & 0x7FFFFF & ((static_cast(1) << (shift_table[bits >> 23])) - 1)) != 0) - | (((bits >> 23) <= 102) & ((bits >> 23) != 0))) - & (hbits < 0x7C00)) - - ((hbits == 0xFC00) & ((bits >> 23) != 511)); - else if (R == std::round_toward_neg_infinity) - hbits += ((((bits & 0x7FFFFF & ((static_cast(1) << (shift_table[bits >> 23])) - 1)) != 0) - | (((bits >> 23) <= 358) & ((bits >> 23) != 256))) - & (hbits < 0xFC00) & (hbits >> 15)) - - ((hbits == 0x7C00) & ((bits >> 23) != 255)); - return hbits; -} - -/// Convert IEEE double-precision to half-precision. -/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding -/// \param value double-precision value -/// \return binary representation of half-precision value -template -uint16 float2half_impl(double value, true_type) -{ - typedef bits::type uint32; - typedef bits::type uint64; - uint64 bits; // = *reinterpret_cast(&value); //violating strict aliasing! - std::memcpy(&bits, &value, sizeof(double)); - uint32 hi = bits >> 32, lo = bits & 0xFFFFFFFF; - uint16 hbits = (hi >> 16) & 0x8000; - hi &= 0x7FFFFFFF; - int exp = hi >> 20; - if (exp == 2047) - return hbits | 0x7C00 | (0x3FF & -static_cast((bits & 0xFFFFFFFFFFFFF) != 0)); - if (exp > 1038) - { - if (R == std::round_toward_infinity) - return hbits | 0x7C00 - (hbits >> 15); - if (R == std::round_toward_neg_infinity) - return hbits | 0x7BFF + (hbits >> 15); - return hbits | 0x7BFF + (R != std::round_toward_zero); - } - int g, s = lo != 0; - if (exp > 1008) - { - g = (hi >> 9) & 1; - s |= (hi & 0x1FF) != 0; - hbits |= ((exp - 1008) << 10) | ((hi >> 10) & 0x3FF); - } - else if (exp > 997) - { - int i = 1018 - exp; - hi = (hi & 0xFFFFF) | 0x100000; - g = (hi >> i) & 1; - s |= (hi & ((1L << i) - 1)) != 0; - hbits |= hi >> (i + 1); - } - else - { - g = 0; - s |= hi != 0; - } - if (R == std::round_to_nearest) -#if HALF_ROUND_TIES_TO_EVEN - hbits += g & (s | hbits); -#else - hbits += g; -#endif - else if (R == std::round_toward_infinity) - hbits += ~(hbits >> 15) & (s | g); - else if (R == std::round_toward_neg_infinity) - hbits += (hbits >> 15) & (g | s); - return hbits; -} - -/// Convert non-IEEE floating point to half-precision. -/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding -/// \tparam T source type (builtin floating point type) -/// \param value floating point value -/// \return binary representation of half-precision value -template -uint16 float2half_impl(T value, ...) 
-{
-    uint16 hbits = static_cast<uint16>(builtin_signbit(value)) << 15;
-    if (value == T())
-        return hbits;
-    if (builtin_isnan(value))
-        return hbits | 0x7FFF;
-    if (builtin_isinf(value))
-        return hbits | 0x7C00;
-    int exp;
-    std::frexp(value, &exp);
-    if (exp > 16)
-    {
-        if (R == std::round_toward_infinity)
-            return hbits | (0x7C00 - (hbits >> 15));
-        else if (R == std::round_toward_neg_infinity)
-            return hbits | (0x7BFF + (hbits >> 15));
-        return hbits | (0x7BFF + (R != std::round_toward_zero));
-    }
-    if (exp < -13)
-        value = std::ldexp(value, 24);
-    else
-    {
-        value = std::ldexp(value, 11 - exp);
-        hbits |= ((exp + 13) << 10);
-    }
-    T ival, frac = std::modf(value, &ival);
-    hbits += static_cast<uint16>(std::abs(static_cast<int>(ival)));
-    if (R == std::round_to_nearest)
-    {
-        frac = std::abs(frac);
-#if HALF_ROUND_TIES_TO_EVEN
-        hbits += (frac > T(0.5)) | ((frac == T(0.5)) & hbits);
-#else
-        hbits += frac >= T(0.5);
-#endif
-    }
-    else if (R == std::round_toward_infinity)
-        hbits += frac > T();
-    else if (R == std::round_toward_neg_infinity)
-        hbits += frac < T();
-    return hbits;
-}
-
-/// Convert floating point to half-precision.
-/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
-/// \tparam T source type (builtin floating point type)
-/// \param value floating point value
-/// \return binary representation of half-precision value
-template <std::float_round_style R, typename T>
-uint16 float2half(T value)
-{
-    return float2half_impl<R>(
-        value, bool_type<std::numeric_limits<T>::is_iec559 && sizeof(typename bits<T>::type) == sizeof(T)>());
-}
-
-/// Convert integer to half-precision floating point.
-/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
-/// \tparam S `true` if value negative, `false` else
-/// \tparam T type to convert (builtin integer type)
-/// \param value non-negative integral value
-/// \return binary representation of half-precision value
-template <std::float_round_style R, bool S, typename T>
-uint16 int2half_impl(T value)
-{
-#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
-    static_assert(std::is_integral<T>::value, "int to half conversion only supports builtin integer types");
-#endif
-    if (S)
-        value = -value;
-    uint16 bits = S << 15;
-    if (value > 0xFFFF)
-    {
-        if (R == std::round_toward_infinity)
-            bits |= 0x7C00 - S;
-        else if (R == std::round_toward_neg_infinity)
-            bits |= 0x7BFF + S;
-        else
-            bits |= 0x7BFF + (R != std::round_toward_zero);
-    }
-    else if (value)
-    {
-        uint32_t m = value, exp = 24;
-        for (; m < 0x400; m <<= 1, --exp)
-            ;
-        for (; m > 0x7FF; m >>= 1, ++exp)
-            ;
-        bits |= (exp << 10) + m;
-        if (exp > 24)
-        {
-            if (R == std::round_to_nearest)
-                bits += (value >> (exp - 25)) & 1
-#if HALF_ROUND_TIES_TO_EVEN
-                    & (((((1 << (exp - 25)) - 1) & value) != 0) | bits)
-#endif
-                    ;
-            else if (R == std::round_toward_infinity)
-                bits += ((value & ((1 << (exp - 24)) - 1)) != 0) & !S;
-            else if (R == std::round_toward_neg_infinity)
-                bits += ((value & ((1 << (exp - 24)) - 1)) != 0) & S;
-        }
-    }
-    return bits;
-}
-
-/// Convert integer to half-precision floating point.
-/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
-/// \tparam T type to convert (builtin integer type)
-/// \param value integral value
-/// \return binary representation of half-precision value
-template <std::float_round_style R, typename T>
-uint16 int2half(T value)
-{
-    return (value < 0) ? int2half_impl<R, true>(value) : int2half_impl<R, false>(value);
-}
-
-/// Convert half-precision to IEEE single-precision.
-/// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf).
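The two normalization loops in int2half_impl() above are the heart of the conversion: shift the magnitude until it fits the 10-bit stored significand with an implicit leading bit, adjusting the exponent in step. A trimmed sketch of just that step, valid for inputs in (0, 2048) where no rounding is needed:

    #include <cstdint>
    #include <cstdio>

    static std::uint16_t int2half_no_rounding(std::uint32_t value) // 0 < value < 2048
    {
        std::uint32_t m = value, exp = 24;
        for (; m < 0x400; m <<= 1, --exp) {} // normalize small magnitudes up
        for (; m > 0x7FF; m >>= 1, ++exp) {} // normalize large magnitudes down
        // The implicit leading bit of m carries into the exponent field here.
        return static_cast<std::uint16_t>((exp << 10) + m);
    }

    int main()
    {
        std::printf("0x%04X\n", int2half_no_rounding(1)); // 0x3C00 == half(1.0)
        return 0;
    }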
-/// \param value binary representation of half-precision value -/// \return single-precision value -inline float half2float_impl(uint16 value, float, true_type) -{ - typedef bits::type uint32; - /* uint32 bits = static_cast(value&0x8000) << 16; - int abs = value & 0x7FFF; - if(abs) - { - bits |= 0x38000000 << static_cast(abs>=0x7C00); - for(; abs<0x400; abs<<=1,bits-=0x800000) ; - bits += static_cast(abs) << 13; - } - */ - static const uint32 mantissa_table[2048] = {0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, - 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, - 0x35700000, 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, - 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, 0x36000000, - 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, - 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, 0x36400000, 0x36440000, 0x36480000, - 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, - 0x36700000, 0x36740000, 0x36780000, 0x367C0000, 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, - 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, - 0x369C0000, 0x369E0000, 0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, - 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, - 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, - 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, 0x36E00000, 0x36E20000, - 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, - 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000, - 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, - 0x370D0000, 0x370E0000, 0x370F0000, 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, - 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, - 0x371F0000, 0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, - 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, 0x37300000, - 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, - 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, 0x37400000, 0x37410000, 0x37420000, - 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, - 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, 0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, - 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, - 0x375E0000, 0x375F0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, - 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, - 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, - 0x37790000, 0x377A0000, 0x377B0000, 
0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, 0x37800000, 0x37808000, - 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, - 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000, - 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, - 0x378E8000, 0x378F0000, 0x378F8000, 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, - 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, - 0x37978000, 0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, - 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, 0x37A00000, - 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, - 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, 0x37A80000, 0x37A88000, 0x37A90000, - 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, - 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, - 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, - 0x37B70000, 0x37B78000, 0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, - 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, - 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, - 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, 0x37C80000, 0x37C88000, - 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, - 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, - 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, - 0x37D68000, 0x37D70000, 0x37D78000, 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, - 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, - 0x37DF8000, 0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, - 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, 0x37E80000, - 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, - 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, 0x37F00000, 0x37F08000, 0x37F10000, - 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, - 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, 0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, - 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, - 0x37FF0000, 0x37FF8000, 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, - 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, - 0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, - 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 
0x38078000, 0x3807C000, 0x38080000, 0x38084000, - 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, - 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, - 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, - 0x380F4000, 0x380F8000, 0x380FC000, 0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, - 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, - 0x3813C000, 0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, - 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, 0x38180000, - 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, - 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, 0x381C0000, 0x381C4000, 0x381C8000, - 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, - 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, 0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, - 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, - 0x38238000, 0x3823C000, 0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, - 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, - 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, - 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, 0x382C0000, 0x382C4000, - 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, - 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, 0x38300000, 0x38304000, 0x38308000, 0x3830C000, - 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, - 0x38334000, 0x38338000, 0x3833C000, 0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, - 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, - 0x3837C000, 0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, - 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, 0x383C0000, - 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, - 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, 0x38400000, 0x38404000, 0x38408000, - 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, - 0x38430000, 0x38434000, 0x38438000, 0x3843C000, 0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, - 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, - 0x38478000, 0x3847C000, 0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, - 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, - 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, - 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, 
0x38500000, 0x38504000, - 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, - 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, 0x38540000, 0x38544000, 0x38548000, 0x3854C000, - 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, - 0x38574000, 0x38578000, 0x3857C000, 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, - 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, - 0x385BC000, 0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, - 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, 0x38600000, - 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, - 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, 0x38640000, 0x38644000, 0x38648000, - 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, - 0x38670000, 0x38674000, 0x38678000, 0x3867C000, 0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, - 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, - 0x386B8000, 0x386BC000, 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, - 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, - 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, - 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, 0x38740000, 0x38744000, - 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, - 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, 0x38780000, 0x38784000, 0x38788000, 0x3878C000, - 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, - 0x387B4000, 0x387B8000, 0x387BC000, 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, - 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, - 0x387FC000, 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, - 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, 0x38020000, - 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, - 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, 0x38040000, 0x38042000, 0x38044000, - 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, - 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, 0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, - 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, - 0x3807C000, 0x3807E000, 0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, - 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, - 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, - 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, 0x380C0000, 0x380C2000, - 
0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, - 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, - 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, - 0x380FA000, 0x380FC000, 0x380FE000, 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, - 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, - 0x3811E000, 0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, - 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, 0x38140000, - 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, - 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, 0x38160000, 0x38162000, 0x38164000, - 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, - 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, 0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, - 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, - 0x3819C000, 0x3819E000, 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, - 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, - 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, - 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, 0x381E0000, 0x381E2000, - 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, - 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, 0x38200000, 0x38202000, 0x38204000, 0x38206000, - 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, - 0x3821A000, 0x3821C000, 0x3821E000, 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, - 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, - 0x3823E000, 0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, - 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, 0x38260000, - 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, - 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, 0x38280000, 0x38282000, 0x38284000, - 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, - 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, 0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, - 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, - 0x382BC000, 0x382BE000, 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, - 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, - 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, - 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, 0x38300000, 0x38302000, - 0x38304000, 0x38306000, 
0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, - 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, - 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, - 0x3833A000, 0x3833C000, 0x3833E000, 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, - 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, - 0x3835E000, 0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, - 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, 0x38380000, - 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, - 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, 0x383A0000, 0x383A2000, 0x383A4000, - 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, - 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, 0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, - 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, - 0x383DC000, 0x383DE000, 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, - 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, - 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, - 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, 0x38420000, 0x38422000, - 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, - 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, 0x38440000, 0x38442000, 0x38444000, 0x38446000, - 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, - 0x3845A000, 0x3845C000, 0x3845E000, 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, - 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, - 0x3847E000, 0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, - 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, 0x384A0000, - 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, - 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, 0x384C0000, 0x384C2000, 0x384C4000, - 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, - 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, 0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, - 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, - 0x384FC000, 0x384FE000, 0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, - 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, - 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, - 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, 0x38540000, 0x38542000, - 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 
0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, - 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, 0x38560000, 0x38562000, 0x38564000, 0x38566000, - 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, - 0x3857A000, 0x3857C000, 0x3857E000, 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, - 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, - 0x3859E000, 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, - 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, 0x385C0000, - 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, - 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, 0x385E0000, 0x385E2000, 0x385E4000, - 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, - 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, - 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, - 0x3861C000, 0x3861E000, 0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, - 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, - 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, - 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, 0x38660000, 0x38662000, - 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, - 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, 0x38680000, 0x38682000, 0x38684000, 0x38686000, - 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, - 0x3869A000, 0x3869C000, 0x3869E000, 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, - 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, - 0x386BE000, 0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, - 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, 0x386E0000, - 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, - 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, 0x38700000, 0x38702000, 0x38704000, - 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, - 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, 0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, - 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, - 0x3873C000, 0x3873E000, 0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, - 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, - 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, - 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, 0x38780000, 0x38782000, - 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 
0x38790000, 0x38792000, 0x38794000, - 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, 0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, - 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, - 0x387BA000, 0x387BC000, 0x387BE000, 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, - 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, - 0x387DE000, 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, - 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000}; - static const uint32 exponent_table[64] = {0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, - 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, - 0x07800000, 0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, - 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, 0x80000000, - 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, - 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, 0x88000000, 0x88800000, 0x89000000, - 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, - 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000}; - static const unsigned short offset_table[64] = {0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, - 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, - 1024, 1024, 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, - 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024}; - uint32 bits = mantissa_table[offset_table[value >> 10] + (value & 0x3FF)] + exponent_table[value >> 10]; - // return *reinterpret_cast(&bits); //violating strict aliasing! - float out; - std::memcpy(&out, &bits, sizeof(float)); - return out; -} - -/// Convert half-precision to IEEE double-precision. -/// \param value binary representation of half-precision value -/// \return double-precision value -inline double half2float_impl(uint16 value, double, true_type) -{ - typedef bits::type uint32; - typedef bits::type uint64; - uint32 hi = static_cast(value & 0x8000) << 16; - int abs = value & 0x7FFF; - if (abs) - { - hi |= 0x3F000000 << static_cast(abs >= 0x7C00); - for (; abs < 0x400; abs <<= 1, hi -= 0x100000) - ; - hi += static_cast(abs) << 10; - } - uint64 bits = static_cast(hi) << 32; - // return *reinterpret_cast(&bits); //violating strict aliasing! - double out; - std::memcpy(&out, &bits, sizeof(double)); - return out; -} - -/// Convert half-precision to non-IEEE floating point. -/// \tparam T type to convert to (builtin integer type) -/// \param value binary representation of half-precision value -/// \return floating point value -template -T half2float_impl(uint16 value, T, ...) -{ - T out; - int abs = value & 0x7FFF; - if (abs > 0x7C00) - out = std::numeric_limits::has_quiet_NaN ? std::numeric_limits::quiet_NaN() : T(); - else if (abs == 0x7C00) - out = std::numeric_limits::has_infinity ? 
std::numeric_limits::infinity() : std::numeric_limits::max(); - else if (abs > 0x3FF) - out = std::ldexp(static_cast((abs & 0x3FF) | 0x400), (abs >> 10) - 25); - else - out = std::ldexp(static_cast(abs), -24); - return (value & 0x8000) ? -out : out; -} - -/// Convert half-precision to floating point. -/// \tparam T type to convert to (builtin integer type) -/// \param value binary representation of half-precision value -/// \return floating point value -template -T half2float(uint16 value) -{ - return half2float_impl( - value, T(), bool_type < std::numeric_limits::is_iec559 && sizeof(typename bits::type) == sizeof(T) > ()); -} - -/// Convert half-precision floating point to integer. -/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding -/// \tparam E `true` for round to even, `false` for round away from zero -/// \tparam T type to convert to (buitlin integer type with at least 16 bits precision, excluding any implicit sign -/// bits) \param value binary representation of half-precision value \return integral value -template -T half2int_impl(uint16 value) -{ -#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS - static_assert(std::is_integral::value, "half to int conversion only supports builtin integer types"); -#endif - uint32_t e = value & 0x7FFF; - if (e >= 0x7C00) - return (value & 0x8000) ? std::numeric_limits::min() : std::numeric_limits::max(); - if (e < 0x3800) - { - if (R == std::round_toward_infinity) - return T(~(value >> 15) & (e != 0)); - else if (R == std::round_toward_neg_infinity) - return -T(value > 0x8000); - return T(); - } - uint32_t m = (value & 0x3FF) | 0x400; - e >>= 10; - if (e < 25) - { - if (R == std::round_to_nearest) - m += (1 << (24 - e)) - (~(m >> (25 - e)) & E); - else if (R == std::round_toward_infinity) - m += ((value >> 15) - 1) & ((1 << (25 - e)) - 1U); - else if (R == std::round_toward_neg_infinity) - m += -(value >> 15) & ((1 << (25 - e)) - 1U); - m >>= 25 - e; - } - else - m <<= e - 25; - return (value & 0x8000) ? -static_cast(m) : static_cast(m); -} - -/// Convert half-precision floating point to integer. -/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding -/// \tparam T type to convert to (buitlin integer type with at least 16 bits precision, excluding any implicit sign -/// bits) \param value binary representation of half-precision value \return integral value -template -T half2int(uint16 value) -{ - return half2int_impl(value); -} - -/// Convert half-precision floating point to integer using round-to-nearest-away-from-zero. -/// \tparam T type to convert to (buitlin integer type with at least 16 bits precision, excluding any implicit sign -/// bits) \param value binary representation of half-precision value \return integral value -template -T half2int_up(uint16 value) -{ - return half2int_impl(value); -} - -/// Round half-precision number to nearest integer value. 
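The saturation branch at the top of half2int_impl() means out-of-range magnitudes, infinities, and NaNs clamp to the destination type's extremes instead of overflowing. A behavioral sketch through the library's public half_cast(), which is defined further down in this header; the include path is assumed:

    #include "common_deprecated/half.h"
    #include <iostream>
    #include <limits>

    int main()
    {
        using half_float::half;
        half inf = std::numeric_limits<half>::infinity();
        // Saturates to INT_MAX rather than invoking signed overflow:
        std::cout << half_float::half_cast<int>(inf) << std::endl;
        std::cout << std::numeric_limits<int>::max() << std::endl; // same value
        return 0;
    }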
-/// Round half-precision number to nearest integer value.
-/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
-/// \tparam E `true` for round to even, `false` for round away from zero
-/// \param value binary representation of half-precision value
-/// \return half-precision bits for nearest integral value
-template <std::float_round_style R, bool E>
-uint16 round_half_impl(uint16 value)
-{
-    uint32_t e = value & 0x7FFF;
-    uint16 result = value;
-    if (e < 0x3C00)
-    {
-        result &= 0x8000;
-        if (R == std::round_to_nearest)
-            result |= 0x3C00U & -(e >= (0x3800 + E));
-        else if (R == std::round_toward_infinity)
-            result |= 0x3C00U & -(~(value >> 15) & (e != 0));
-        else if (R == std::round_toward_neg_infinity)
-            result |= 0x3C00U & -(value > 0x8000);
-    }
-    else if (e < 0x6400)
-    {
-        e = 25 - (e >> 10);
-        uint32_t mask = (1 << e) - 1;
-        if (R == std::round_to_nearest)
-            result += (1 << (e - 1)) - (~(result >> e) & E);
-        else if (R == std::round_toward_infinity)
-            result += mask & ((value >> 15) - 1);
-        else if (R == std::round_toward_neg_infinity)
-            result += mask & -(value >> 15);
-        result &= ~mask;
-    }
-    return result;
-}
-
-/// Round half-precision number to nearest integer value.
-/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
-/// \param value binary representation of half-precision value
-/// \return half-precision bits for nearest integral value
-template <std::float_round_style R>
-uint16 round_half(uint16 value)
-{
-    return round_half_impl<R, HALF_ROUND_TIES_TO_EVEN>(value);
-}
-
-/// Round half-precision number to nearest integer value using round-to-nearest-away-from-zero.
-/// \param value binary representation of half-precision value
-/// \return half-precision bits for nearest integral value
-inline uint16 round_half_up(uint16 value)
-{
-    return round_half_impl<std::round_to_nearest, 0>(value);
-}
-/// \}
-
-struct functions;
-template <typename T>
-struct unary_specialized;
-template <typename T, typename U>
-struct binary_specialized;
-template <typename T, typename U, std::float_round_style R>
-struct half_caster;
-} // namespace detail
-
-/// Half-precision floating point type.
-/// This class implements an IEEE-conformant half-precision floating point type with the usual arithmetic operators and
-/// conversions. It is implicitly convertible to single-precision floating point, which makes arithmetic expressions and
-/// functions with mixed-type operands evaluate in the most precise operand type. Additionally all arithmetic operations
-/// (and many mathematical functions) are carried out in single-precision internally. All conversions from single- to
-/// half-precision are done using the library's default rounding mode, but temporary results inside chained arithmetic
-/// expressions are kept in single-precision as long as possible (while of course still maintaining a strong
-/// half-precision type).
-///
-/// According to the C++98/03 definition, the half type is not a POD type. But according to C++11's less strict and
-/// extended definitions it is both a standard layout type and a trivially copyable type (even if not a POD type), which
-/// means it can be standard-conformantly copied using raw binary copies. A few more words about the actual size of the
-/// type are in order. Although the half is representing an IEEE 16-bit type, it does not necessarily have to be of
-/// exactly 16-bits size. But on any reasonable implementation the actual binary representation of this type will most
-/// probably not involve any additional "magic" or padding beyond the simple binary representation of the underlying
-/// 16-bit IEEE number, even if not strictly guaranteed by the standard. It only has an actual size of 16 bits if your
-/// C++ implementation supports an unsigned integer type of exactly 16 bits width. But this should be the case on
-/// nearly any reasonable platform.
-///
-/// So if your C++ implementation is not totally exotic or imposes special alignment requirements, it is a reasonable
-/// assumption that the data of a half is just comprised of the 2 bytes of the underlying IEEE representation.
-class half
-{
-    friend struct detail::functions;
-    friend struct detail::unary_specialized<half>;
-    friend struct detail::binary_specialized<half, half>;
-    template <typename, typename, std::float_round_style>
-    friend struct detail::half_caster;
-    friend class std::numeric_limits<half>;
-#if HALF_ENABLE_CPP11_HASH
-    friend struct std::hash<half>;
-#endif
-#if HALF_ENABLE_CPP11_USER_LITERALS
-    friend half literal::operator"" _h(long double);
-#endif
-
-public:
-    /// Default constructor.
-    /// This initializes the half to 0. Although this does not match the builtin types' default-initialization semantics
-    /// and may be less efficient than no initialization, it is needed to provide proper value-initialization semantics.
-    HALF_CONSTEXPR half() HALF_NOEXCEPT : data_() {}
-
-    /// Copy constructor.
-    /// \tparam T type of concrete half expression
-    /// \param rhs half expression to copy from
-    half(detail::expr rhs)
-        : data_(detail::float2half<round_style>(static_cast<float>(rhs)))
-    {
-    }
-
-    /// Conversion constructor.
-    /// \param rhs float to convert
-    explicit half(float rhs)
-        : data_(detail::float2half<round_style>(rhs))
-    {
-    }
-
-    /// Conversion to single-precision.
-    /// \return single precision value representing expression value
-    operator float() const
-    {
-        return detail::half2float<float>(data_);
-    }
-
-    /// Assignment operator.
-    /// \tparam T type of concrete half expression
-    /// \param rhs half expression to copy from
-    /// \return reference to this half
-    half& operator=(detail::expr rhs)
-    {
-        return *this = static_cast<float>(rhs);
-    }
-
-    /// Arithmetic assignment.
-    /// \tparam T type of concrete half expression
-    /// \param rhs half expression to add
-    /// \return reference to this half
-    template <typename T>
-    typename detail::enable<half&, T>::type operator+=(T rhs)
-    {
-        return *this += static_cast<float>(rhs);
-    }
-
-    /// Arithmetic assignment.
-    /// \tparam T type of concrete half expression
-    /// \param rhs half expression to subtract
-    /// \return reference to this half
-    template <typename T>
-    typename detail::enable<half&, T>::type operator-=(T rhs)
-    {
-        return *this -= static_cast<float>(rhs);
-    }
-
-    /// Arithmetic assignment.
-    /// \tparam T type of concrete half expression
-    /// \param rhs half expression to multiply with
-    /// \return reference to this half
-    template <typename T>
-    typename detail::enable<half&, T>::type operator*=(T rhs)
-    {
-        return *this *= static_cast<float>(rhs);
-    }
-
-    /// Arithmetic assignment.
-    /// \tparam T type of concrete half expression
-    /// \param rhs half expression to divide by
-    /// \return reference to this half
-    template <typename T>
-    typename detail::enable<half&, T>::type operator/=(T rhs)
-    {
-        return *this /= static_cast<float>(rhs);
-    }
-
-    /// Assignment operator.
-    /// \param rhs single-precision value to copy from
-    /// \return reference to this half
-    half& operator=(float rhs)
-    {
-        data_ = detail::float2half<round_style>(rhs);
-        return *this;
-    }
-
-    /// Arithmetic assignment.
-    /// \param rhs single-precision value to add
-    /// \return reference to this half
-    half& operator+=(float rhs)
-    {
-        data_ = detail::float2half<round_style>(detail::half2float<float>(data_) + rhs);
-        return *this;
-    }
-
-    /// Arithmetic assignment.
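A short usage sketch of the class interface so far may help readers new to this vendored type. It is illustrative only and assumes the header is included as "half.h"; note that construction from float is explicit, while the conversions back to float and from intermediate expression results are implicit:

    #include <iostream>
    #include "half.h"

    int main()
    {
        using half_float::half;
        half x(1.0f);           // construction from float is explicit
        x += 0.5f;              // compound assignment round-trips through float
        half y = x * x;         // x * x yields a detail::expr, implicitly converted back
        float f = y;            // implicit widening conversion to float
        std::cout << f << '\n'; // 2.25, exactly representable in half
        return 0;
    }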
- /// \param rhs single-precision value to subtract - /// \return reference to this half - half& operator-=(float rhs) - { - data_ = detail::float2half(detail::half2float(data_) - rhs); - return *this; - } - - /// Arithmetic assignment. - /// \param rhs single-precision value to multiply with - /// \return reference to this half - half& operator*=(float rhs) - { - data_ = detail::float2half(detail::half2float(data_) * rhs); - return *this; - } - - /// Arithmetic assignment. - /// \param rhs single-precision value to divide by - /// \return reference to this half - half& operator/=(float rhs) - { - data_ = detail::float2half(detail::half2float(data_) / rhs); - return *this; - } - - /// Prefix increment. - /// \return incremented half value - half& operator++() - { - return *this += 1.0f; - } - - /// Prefix decrement. - /// \return decremented half value - half& operator--() - { - return *this -= 1.0f; - } - - /// Postfix increment. - /// \return non-incremented half value - half operator++(int) - { - half out(*this); - ++*this; - return out; - } - - /// Postfix decrement. - /// \return non-decremented half value - half operator--(int) - { - half out(*this); - --*this; - return out; - } - -private: - /// Rounding mode to use - static const std::float_round_style round_style = (std::float_round_style)(HALF_ROUND_STYLE); - - /// Constructor. - /// \param bits binary representation to set half to - HALF_CONSTEXPR half(detail::binary_t, detail::uint16 bits) HALF_NOEXCEPT : data_(bits) {} - - /// Internal binary representation - detail::uint16 data_; -}; - -#if HALF_ENABLE_CPP11_USER_LITERALS -namespace literal -{ -/// Half literal. -/// While this returns an actual half-precision value, half literals can unfortunately not be constant expressions due -/// to rather involved conversions. -/// \param value literal value -/// \return half with given value (if representable) -inline half operator"" _h(long double value) -{ - return half(detail::binary, detail::float2half(value)); -} -} // namespace literal -#endif - -namespace detail -{ -/// Wrapper implementing unspecialized half-precision functions. -struct functions -{ - /// Addition implementation. - /// \param x first operand - /// \param y second operand - /// \return Half-precision sum stored in single-precision - static expr plus(float x, float y) - { - return expr(x + y); - } - - /// Subtraction implementation. - /// \param x first operand - /// \param y second operand - /// \return Half-precision difference stored in single-precision - static expr minus(float x, float y) - { - return expr(x - y); - } - - /// Multiplication implementation. - /// \param x first operand - /// \param y second operand - /// \return Half-precision product stored in single-precision - static expr multiplies(float x, float y) - { - return expr(x * y); - } - - /// Division implementation. - /// \param x first operand - /// \param y second operand - /// \return Half-precision quotient stored in single-precision - static expr divides(float x, float y) - { - return expr(x / y); - } - - /// Output implementation. - /// \param out stream to write to - /// \param arg value to write - /// \return reference to stream - template - static std::basic_ostream& write(std::basic_ostream& out, float arg) - { - return out << arg; - } - - /// Input implementation. 
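The user-defined literal and the stream hooks above can be exercised as follows; this sketch is not part of the patch and assumes the header's own feature detection sets HALF_ENABLE_CPP11_USER_LITERALS, as it does on current compilers:

    #include <iostream>
    #include <sstream>
    #include "half.h"

    int main()
    {
    #if HALF_ENABLE_CPP11_USER_LITERALS
        using namespace half_float::literal;
        half_float::half h = 0.25_h;       // parsed as long double, converted via float2half
    #else
        half_float::half h(0.25f);
    #endif
        std::stringstream ss;
        ss << h;                           // functions::write streams the float value
        half_float::half back;
        ss >> back;                        // functions::read parses a float and converts
        std::cout << float(back) << '\n';  // 0.25
        return 0;
    }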
- /// \param in stream to read from - /// \param arg half to read into - /// \return reference to stream - template - static std::basic_istream& read(std::basic_istream& in, half& arg) - { - float f; - if (in >> f) - arg = f; - return in; - } - - /// Modulo implementation. - /// \param x first operand - /// \param y second operand - /// \return Half-precision division remainder stored in single-precision - static expr fmod(float x, float y) - { - return expr(std::fmod(x, y)); - } - - /// Remainder implementation. - /// \param x first operand - /// \param y second operand - /// \return Half-precision division remainder stored in single-precision - static expr remainder(float x, float y) - { -#if HALF_ENABLE_CPP11_CMATH - return expr(std::remainder(x, y)); -#else - if (builtin_isnan(x) || builtin_isnan(y)) - return expr(std::numeric_limits::quiet_NaN()); - float ax = std::fabs(x), ay = std::fabs(y); - if (ax >= 65536.0f || ay < std::ldexp(1.0f, -24)) - return expr(std::numeric_limits::quiet_NaN()); - if (ay >= 65536.0f) - return expr(x); - if (ax == ay) - return expr(builtin_signbit(x) ? -0.0f : 0.0f); - ax = std::fmod(ax, ay + ay); - float y2 = 0.5f * ay; - if (ax > y2) - { - ax -= ay; - if (ax >= y2) - ax -= ay; - } - return expr(builtin_signbit(x) ? -ax : ax); -#endif - } - - /// Remainder implementation. - /// \param x first operand - /// \param y second operand - /// \param quo address to store quotient bits at - /// \return Half-precision division remainder stored in single-precision - static expr remquo(float x, float y, int* quo) - { -#if HALF_ENABLE_CPP11_CMATH - return expr(std::remquo(x, y, quo)); -#else - if (builtin_isnan(x) || builtin_isnan(y)) - return expr(std::numeric_limits::quiet_NaN()); - bool sign = builtin_signbit(x), qsign = static_cast(sign ^ builtin_signbit(y)); - float ax = std::fabs(x), ay = std::fabs(y); - if (ax >= 65536.0f || ay < std::ldexp(1.0f, -24)) - return expr(std::numeric_limits::quiet_NaN()); - if (ay >= 65536.0f) - return expr(x); - if (ax == ay) - return *quo = qsign ? -1 : 1, expr(sign ? -0.0f : 0.0f); - ax = std::fmod(ax, 8.0f * ay); - int cquo = 0; - if (ax >= 4.0f * ay) - { - ax -= 4.0f * ay; - cquo += 4; - } - if (ax >= 2.0f * ay) - { - ax -= 2.0f * ay; - cquo += 2; - } - float y2 = 0.5f * ay; - if (ax > y2) - { - ax -= ay; - ++cquo; - if (ax >= y2) - { - ax -= ay; - ++cquo; - } - } - return *quo = qsign ? -cquo : cquo, expr(sign ? -ax : ax); -#endif - } - - /// Positive difference implementation. - /// \param x first operand - /// \param y second operand - /// \return Positive difference stored in single-precision - static expr fdim(float x, float y) - { -#if HALF_ENABLE_CPP11_CMATH - return expr(std::fdim(x, y)); -#else - return expr((x <= y) ? 0.0f : (x - y)); -#endif - } - - /// Fused multiply-add implementation. - /// \param x first operand - /// \param y second operand - /// \param z third operand - /// \return \a x * \a y + \a z stored in single-precision - static expr fma(float x, float y, float z) - { -#if HALF_ENABLE_CPP11_CMATH && defined(FP_FAST_FMAF) - return expr(std::fma(x, y, z)); -#else - return expr(x * y + z); -#endif - } - - /// Get NaN. - /// \return Half-precision quiet NaN - static half nanh() - { - return half(binary, 0x7FFF); - } - - /// Exponential implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr exp(float arg) - { - return expr(std::exp(arg)); - } - - /// Exponential implementation. 
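As a quick check of the remainder and fused multiply-add wrappers above (a hedged sketch, assuming "half.h" and the half_float namespace; the free remquo() and fma() wrappers are defined later in this header):

    #include <iostream>
    #include "half.h"

    int main()
    {
        using half_float::half;
        int q = 0;
        half r = remquo(half(7.0f), half(2.0f), &q);  // IEEE remainder: 7 = 2*4 - 1
        std::cout << float(r) << ' ' << q << '\n';    // -1 4
        half m = fma(half(2.0f), half(3.0f), half(1.0f));
        std::cout << float(m) << '\n';                // 7
        return 0;
    }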
- /// \param arg function argument - /// \return function value stored in single-preicision - static expr expm1(float arg) - { -#if HALF_ENABLE_CPP11_CMATH - return expr(std::expm1(arg)); -#else - return expr(static_cast(std::exp(static_cast(arg)) - 1.0)); -#endif - } - - /// Binary exponential implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr exp2(float arg) - { -#if HALF_ENABLE_CPP11_CMATH - return expr(std::exp2(arg)); -#else - return expr(static_cast(std::exp(arg * 0.69314718055994530941723212145818))); -#endif - } - - /// Logarithm implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr log(float arg) - { - return expr(std::log(arg)); - } - - /// Common logarithm implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr log10(float arg) - { - return expr(std::log10(arg)); - } - - /// Logarithm implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr log1p(float arg) - { -#if HALF_ENABLE_CPP11_CMATH - return expr(std::log1p(arg)); -#else - return expr(static_cast(std::log(1.0 + arg))); -#endif - } - - /// Binary logarithm implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr log2(float arg) - { -#if HALF_ENABLE_CPP11_CMATH - return expr(std::log2(arg)); -#else - return expr(static_cast(std::log(static_cast(arg)) * 1.4426950408889634073599246810019)); -#endif - } - - /// Square root implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr sqrt(float arg) - { - return expr(std::sqrt(arg)); - } - - /// Cubic root implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr cbrt(float arg) - { -#if HALF_ENABLE_CPP11_CMATH - return expr(std::cbrt(arg)); -#else - if (builtin_isnan(arg) || builtin_isinf(arg)) - return expr(arg); - return expr(builtin_signbit(arg) ? -static_cast(std::pow(-static_cast(arg), 1.0 / 3.0)) - : static_cast(std::pow(static_cast(arg), 1.0 / 3.0))); -#endif - } - - /// Hypotenuse implementation. - /// \param x first argument - /// \param y second argument - /// \return function value stored in single-preicision - static expr hypot(float x, float y) - { -#if HALF_ENABLE_CPP11_CMATH - return expr(std::hypot(x, y)); -#else - return expr((builtin_isinf(x) || builtin_isinf(y)) - ? std::numeric_limits::infinity() - : static_cast(std::sqrt(static_cast(x) * x + static_cast(y) * y))); -#endif - } - - /// Power implementation. - /// \param base value to exponentiate - /// \param exp power to expontiate to - /// \return function value stored in single-preicision - static expr pow(float base, float exp) - { - return expr(std::pow(base, exp)); - } - - /// Sine implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr sin(float arg) - { - return expr(std::sin(arg)); - } - - /// Cosine implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr cos(float arg) - { - return expr(std::cos(arg)); - } - - /// Tan implementation. 
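The exponential and logarithm implementations above all compute in single precision and hand back a detail::expr, so chained uses stay in float until assigned to a half. A minimal sketch, assuming "half.h" is included:

    #include <iostream>
    #include "half.h"

    int main()
    {
        using half_float::half;
        half l = log2(half(8.0f));   // 3, found by argument-dependent lookup
        half e = exp2(half(-1.0f));  // 0.5
        std::cout << float(l) << ' ' << float(e) << '\n';
        return 0;
    }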
- /// \param arg function argument - /// \return function value stored in single-preicision - static expr tan(float arg) - { - return expr(std::tan(arg)); - } - - /// Arc sine implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr asin(float arg) - { - return expr(std::asin(arg)); - } - - /// Arc cosine implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr acos(float arg) - { - return expr(std::acos(arg)); - } - - /// Arc tangent implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr atan(float arg) - { - return expr(std::atan(arg)); - } - - /// Arc tangent implementation. - /// \param x first argument - /// \param y second argument - /// \return function value stored in single-preicision - static expr atan2(float x, float y) - { - return expr(std::atan2(x, y)); - } - - /// Hyperbolic sine implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr sinh(float arg) - { - return expr(std::sinh(arg)); - } - - /// Hyperbolic cosine implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr cosh(float arg) - { - return expr(std::cosh(arg)); - } - - /// Hyperbolic tangent implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr tanh(float arg) - { - return expr(std::tanh(arg)); - } - - /// Hyperbolic area sine implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr asinh(float arg) - { -#if HALF_ENABLE_CPP11_CMATH - return expr(std::asinh(arg)); -#else - return expr((arg == -std::numeric_limits::infinity()) - ? arg - : static_cast(std::log(arg + std::sqrt(arg * arg + 1.0)))); -#endif - } - - /// Hyperbolic area cosine implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr acosh(float arg) - { -#if HALF_ENABLE_CPP11_CMATH - return expr(std::acosh(arg)); -#else - return expr((arg < -1.0f) ? std::numeric_limits::quiet_NaN() - : static_cast(std::log(arg + std::sqrt(arg * arg - 1.0)))); -#endif - } - - /// Hyperbolic area tangent implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr atanh(float arg) - { -#if HALF_ENABLE_CPP11_CMATH - return expr(std::atanh(arg)); -#else - return expr(static_cast(0.5 * std::log((1.0 + arg) / (1.0 - arg)))); -#endif - } - - /// Error function implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr erf(float arg) - { -#if HALF_ENABLE_CPP11_CMATH - return expr(std::erf(arg)); -#else - return expr(static_cast(erf(static_cast(arg)))); -#endif - } - - /// Complementary implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr erfc(float arg) - { -#if HALF_ENABLE_CPP11_CMATH - return expr(std::erfc(arg)); -#else - return expr(static_cast(1.0 - erf(static_cast(arg)))); -#endif - } - - /// Gamma logarithm implementation. 
- /// \param arg function argument - /// \return function value stored in single-preicision - static expr lgamma(float arg) - { -#if HALF_ENABLE_CPP11_CMATH - return expr(std::lgamma(arg)); -#else - if (builtin_isinf(arg)) - return expr(std::numeric_limits::infinity()); - if (arg < 0.0f) - { - float i, f = std::modf(-arg, &i); - if (f == 0.0f) - return expr(std::numeric_limits::infinity()); - return expr(static_cast(1.1447298858494001741434273513531 - - std::log(std::abs(std::sin(3.1415926535897932384626433832795 * f))) - lgamma(1.0 - arg))); - } - return expr(static_cast(lgamma(static_cast(arg)))); -#endif - } - - /// Gamma implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr tgamma(float arg) - { -#if HALF_ENABLE_CPP11_CMATH - return expr(std::tgamma(arg)); -#else - if (arg == 0.0f) - return builtin_signbit(arg) ? expr(-std::numeric_limits::infinity()) - : expr(std::numeric_limits::infinity()); - if (arg < 0.0f) - { - float i, f = std::modf(-arg, &i); - if (f == 0.0f) - return expr(std::numeric_limits::quiet_NaN()); - double value = 3.1415926535897932384626433832795 - / (std::sin(3.1415926535897932384626433832795 * f) * std::exp(lgamma(1.0 - arg))); - return expr(static_cast((std::fmod(i, 2.0f) == 0.0f) ? -value : value)); - } - if (builtin_isinf(arg)) - return expr(arg); - return expr(static_cast(std::exp(lgamma(static_cast(arg))))); -#endif - } - - /// Floor implementation. - /// \param arg value to round - /// \return rounded value - static half floor(half arg) - { - return half(binary, round_half(arg.data_)); - } - - /// Ceiling implementation. - /// \param arg value to round - /// \return rounded value - static half ceil(half arg) - { - return half(binary, round_half(arg.data_)); - } - - /// Truncation implementation. - /// \param arg value to round - /// \return rounded value - static half trunc(half arg) - { - return half(binary, round_half(arg.data_)); - } - - /// Nearest integer implementation. - /// \param arg value to round - /// \return rounded value - static half round(half arg) - { - return half(binary, round_half_up(arg.data_)); - } - - /// Nearest integer implementation. - /// \param arg value to round - /// \return rounded value - static long lround(half arg) - { - return detail::half2int_up(arg.data_); - } - - /// Nearest integer implementation. - /// \param arg value to round - /// \return rounded value - static half rint(half arg) - { - return half(binary, round_half(arg.data_)); - } - - /// Nearest integer implementation. - /// \param arg value to round - /// \return rounded value - static long lrint(half arg) - { - return detail::half2int(arg.data_); - } - -#if HALF_ENABLE_CPP11_LONG_LONG - /// Nearest integer implementation. - /// \param arg value to round - /// \return rounded value - static long long llround(half arg) - { - return detail::half2int_up(arg.data_); - } - - /// Nearest integer implementation. - /// \param arg value to round - /// \return rounded value - static long long llrint(half arg) - { - return detail::half2int(arg.data_); - } -#endif - - /// Decompression implementation. 
- /// \param arg number to decompress - /// \param exp address to store exponent at - /// \return normalized significant - static half frexp(half arg, int* exp) - { - int m = arg.data_ & 0x7FFF, e = -14; - if (m >= 0x7C00 || !m) - return *exp = 0, arg; - for (; m < 0x400; m <<= 1, --e) - ; - return *exp = e + (m >> 10), half(binary, (arg.data_ & 0x8000) | 0x3800 | (m & 0x3FF)); - } - - /// Decompression implementation. - /// \param arg number to decompress - /// \param iptr address to store integer part at - /// \return fractional part - static half modf(half arg, half* iptr) - { - uint32_t e = arg.data_ & 0x7FFF; - if (e >= 0x6400) - return *iptr = arg, half(binary, arg.data_ & (0x8000U | -(e > 0x7C00))); - if (e < 0x3C00) - return iptr->data_ = arg.data_ & 0x8000, arg; - e >>= 10; - uint32_t mask = (1 << (25 - e)) - 1, m = arg.data_ & mask; - iptr->data_ = arg.data_ & ~mask; - if (!m) - return half(binary, arg.data_ & 0x8000); - for (; m < 0x400; m <<= 1, --e) - ; - return half(binary, static_cast((arg.data_ & 0x8000) | (e << 10) | (m & 0x3FF))); - } - - /// Scaling implementation. - /// \param arg number to scale - /// \param exp power of two to scale by - /// \return scaled number - static half scalbln(half arg, long exp) - { - uint32_t m = arg.data_ & 0x7FFF; - if (m >= 0x7C00 || !m) - return arg; - for (; m < 0x400; m <<= 1, --exp) - ; - exp += m >> 10; - uint16 value = arg.data_ & 0x8000; - if (exp > 30) - { - if (half::round_style == std::round_toward_zero) - value |= 0x7BFF; - else if (half::round_style == std::round_toward_infinity) - value |= 0x7C00 - (value >> 15); - else if (half::round_style == std::round_toward_neg_infinity) - value |= 0x7BFF + (value >> 15); - else - value |= 0x7C00; - } - else if (exp > 0) - value |= (exp << 10) | (m & 0x3FF); - else if (exp > -11) - { - m = (m & 0x3FF) | 0x400; - if (half::round_style == std::round_to_nearest) - { - m += 1 << -exp; -#if HALF_ROUND_TIES_TO_EVEN - m -= (m >> (1 - exp)) & 1; -#endif - } - else if (half::round_style == std::round_toward_infinity) - m += ((value >> 15) - 1) & ((1 << (1 - exp)) - 1U); - else if (half::round_style == std::round_toward_neg_infinity) - m += -(value >> 15) & ((1 << (1 - exp)) - 1U); - value |= m >> (1 - exp); - } - else if (half::round_style == std::round_toward_infinity) - value -= (value >> 15) - 1; - else if (half::round_style == std::round_toward_neg_infinity) - value += value >> 15; - return half(binary, value); - } - - /// Exponent implementation. - /// \param arg number to query - /// \return floating point exponent - static int ilogb(half arg) - { - int abs = arg.data_ & 0x7FFF; - if (!abs) - return FP_ILOGB0; - if (abs < 0x7C00) - { - int exp = (abs >> 10) - 15; - if (abs < 0x400) - for (; abs < 0x200; abs <<= 1, --exp) - ; - return exp; - } - if (abs > 0x7C00) - return FP_ILOGBNAN; - return INT_MAX; - } - - /// Exponent implementation. - /// \param arg number to query - /// \return floating point exponent - static half logb(half arg) - { - int abs = arg.data_ & 0x7FFF; - if (!abs) - return half(binary, 0xFC00); - if (abs < 0x7C00) - { - int exp = (abs >> 10) - 15; - if (abs < 0x400) - for (; abs < 0x200; abs <<= 1, --exp) - ; - uint16 bits = (exp < 0) << 15; - if (exp) - { - uint32_t m = std::abs(exp) << 6, e = 18; - for (; m < 0x400; m <<= 1, --e) - ; - bits |= (e << 10) + m; - } - return half(binary, bits); - } - if (abs > 0x7C00) - return arg; - return half(binary, 0x7C00); - } - - /// Enumeration implementation. 
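The decomposition helpers above operate directly on the bit pattern, so no precision is lost. A sketch of frexp() and modf() through the free wrappers defined later in the header (assuming "half.h"):

    #include <iostream>
    #include "half.h"

    int main()
    {
        using half_float::half;
        int e = 0;
        half m = frexp(half(12.0f), &e);            // 12 = 0.75 * 2^4
        std::cout << float(m) << ' ' << e << '\n';  // 0.75 4
        half ip;
        half frac = modf(half(2.75f), &ip);         // integer part via the pointer
        std::cout << float(ip) << ' ' << float(frac) << '\n'; // 2 0.75
        return 0;
    }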
- /// \param from number to increase/decrease - /// \param to direction to enumerate into - /// \return next representable number - static half nextafter(half from, half to) - { - uint16 fabs = from.data_ & 0x7FFF, tabs = to.data_ & 0x7FFF; - if (fabs > 0x7C00) - return from; - if (tabs > 0x7C00 || from.data_ == to.data_ || !(fabs | tabs)) - return to; - if (!fabs) - return half(binary, (to.data_ & 0x8000) + 1); - bool lt = ((fabs == from.data_) ? static_cast(fabs) : -static_cast(fabs)) - < ((tabs == to.data_) ? static_cast(tabs) : -static_cast(tabs)); - return half(binary, from.data_ + (((from.data_ >> 15) ^ static_cast(lt)) << 1) - 1); - } - - /// Enumeration implementation. - /// \param from number to increase/decrease - /// \param to direction to enumerate into - /// \return next representable number - static half nexttoward(half from, long double to) - { - if (isnan(from)) - return from; - long double lfrom = static_cast(from); - if (builtin_isnan(to) || lfrom == to) - return half(static_cast(to)); - if (!(from.data_ & 0x7FFF)) - return half(binary, (static_cast(builtin_signbit(to)) << 15) + 1); - return half(binary, from.data_ + (((from.data_ >> 15) ^ static_cast(lfrom < to)) << 1) - 1); - } - - /// Sign implementation - /// \param x first operand - /// \param y second operand - /// \return composed value - static half copysign(half x, half y) - { - return half(binary, x.data_ ^ ((x.data_ ^ y.data_) & 0x8000)); - } - - /// Classification implementation. - /// \param arg value to classify - /// \retval true if infinite number - /// \retval false else - static int fpclassify(half arg) - { - uint32_t abs = arg.data_ & 0x7FFF; - return abs - ? ((abs > 0x3FF) ? ((abs >= 0x7C00) ? ((abs > 0x7C00) ? FP_NAN : FP_INFINITE) : FP_NORMAL) : FP_SUBNORMAL) - : FP_ZERO; - } - - /// Classification implementation. - /// \param arg value to classify - /// \retval true if finite number - /// \retval false else - static bool isfinite(half arg) - { - return (arg.data_ & 0x7C00) != 0x7C00; - } - - /// Classification implementation. - /// \param arg value to classify - /// \retval true if infinite number - /// \retval false else - static bool isinf(half arg) - { - return (arg.data_ & 0x7FFF) == 0x7C00; - } - - /// Classification implementation. - /// \param arg value to classify - /// \retval true if not a number - /// \retval false else - static bool isnan(half arg) - { - return (arg.data_ & 0x7FFF) > 0x7C00; - } - - /// Classification implementation. - /// \param arg value to classify - /// \retval true if normal number - /// \retval false else - static bool isnormal(half arg) - { - return ((arg.data_ & 0x7C00) != 0) & ((arg.data_ & 0x7C00) != 0x7C00); - } - - /// Sign bit implementation. - /// \param arg value to check - /// \retval true if signed - /// \retval false if unsigned - static bool signbit(half arg) - { - return (arg.data_ & 0x8000) != 0; - } - - /// Comparison implementation. - /// \param x first operand - /// \param y second operand - /// \retval true if operands equal - /// \retval false else - static bool isequal(half x, half y) - { - return (x.data_ == y.data_ || !((x.data_ | y.data_) & 0x7FFF)) && !isnan(x); - } - - /// Comparison implementation. - /// \param x first operand - /// \param y second operand - /// \retval true if operands not equal - /// \retval false else - static bool isnotequal(half x, half y) - { - return (x.data_ != y.data_ && ((x.data_ | y.data_) & 0x7FFF)) || isnan(x); - } - - /// Comparison implementation. 
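nextafter() above steps by exactly one unit in the last place of the 16-bit representation, which makes it handy for probing the format. A hedged sketch (assuming "half.h"; the signbit() free wrapper appears later in the header):

    #include <iostream>
    #include "half.h"

    int main()
    {
        using half_float::half;
        half one(1.0f);
        half up = nextafter(one, half(2.0f));    // one ULP above 1.0
        std::cout << float(up) - 1.0f << '\n';   // 0.000976562 = 2^-10 (10 mantissa bits)
        half tiny = nextafter(half(0.0f), one);  // smallest positive subnormal
        std::cout << float(tiny) << '\n';        // 5.96046e-08 = 2^-24
        std::cout << signbit(half(-0.0f)) << '\n'; // 1: the sign bit survives negative zero
        return 0;
    }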
- /// \param x first operand - /// \param y second operand - /// \retval true if \a x > \a y - /// \retval false else - static bool isgreater(half x, half y) - { - int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; - return xabs <= 0x7C00 && yabs <= 0x7C00 - && (((xabs == x.data_) ? xabs : -xabs) > ((yabs == y.data_) ? yabs : -yabs)); - } - - /// Comparison implementation. - /// \param x first operand - /// \param y second operand - /// \retval true if \a x >= \a y - /// \retval false else - static bool isgreaterequal(half x, half y) - { - int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; - return xabs <= 0x7C00 && yabs <= 0x7C00 - && (((xabs == x.data_) ? xabs : -xabs) >= ((yabs == y.data_) ? yabs : -yabs)); - } - - /// Comparison implementation. - /// \param x first operand - /// \param y second operand - /// \retval true if \a x < \a y - /// \retval false else - static bool isless(half x, half y) - { - int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; - return xabs <= 0x7C00 && yabs <= 0x7C00 - && (((xabs == x.data_) ? xabs : -xabs) < ((yabs == y.data_) ? yabs : -yabs)); - } - - /// Comparison implementation. - /// \param x first operand - /// \param y second operand - /// \retval true if \a x <= \a y - /// \retval false else - static bool islessequal(half x, half y) - { - int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; - return xabs <= 0x7C00 && yabs <= 0x7C00 - && (((xabs == x.data_) ? xabs : -xabs) <= ((yabs == y.data_) ? yabs : -yabs)); - } - - /// Comparison implementation. - /// \param x first operand - /// \param y second operand - /// \retval true if either \a x > \a y nor \a x < \a y - /// \retval false else - static bool islessgreater(half x, half y) - { - int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; - if (xabs > 0x7C00 || yabs > 0x7C00) - return false; - int a = (xabs == x.data_) ? xabs : -xabs, b = (yabs == y.data_) ? yabs : -yabs; - return a < b || a > b; - } - - /// Comparison implementation. - /// \param x first operand - /// \param y second operand - /// \retval true if operand unordered - /// \retval false else - static bool isunordered(half x, half y) - { - return isnan(x) || isnan(y); - } - -private: - static double erf(double arg) - { - if (builtin_isinf(arg)) - return (arg < 0.0) ? -1.0 : 1.0; - double x2 = arg * arg, ax2 = 0.147 * x2, - value = std::sqrt(1.0 - std::exp(-x2 * (1.2732395447351626861510701069801 + ax2) / (1.0 + ax2))); - return builtin_signbit(arg) ? -value : value; - } - - static double lgamma(double arg) - { - double v = 1.0; - for (; arg < 8.0; ++arg) - v *= arg; - double w = 1.0 / (arg * arg); - return (((((((-0.02955065359477124183006535947712 * w + 0.00641025641025641025641025641026) * w - + -0.00191752691752691752691752691753) - * w - + 8.4175084175084175084175084175084e-4) - * w - + -5.952380952380952380952380952381e-4) - * w - + 7.9365079365079365079365079365079e-4) - * w - + -0.00277777777777777777777777777778) - * w - + 0.08333333333333333333333333333333) - / arg - + 0.91893853320467274178032973640562 - std::log(v) - arg + (arg - 0.5) * std::log(arg); - } -}; - -/// Wrapper for unary half-precision functions needing specialization for individual argument types. -/// \tparam T argument type -template -struct unary_specialized -{ - /// Negation implementation. - /// \param arg value to negate - /// \return negated value - static HALF_CONSTEXPR half negate(half arg) - { - return half(binary, arg.data_ ^ 0x8000); - } - - /// Absolute value implementation. 
- /// \param arg function argument - /// \return absolute value - static half fabs(half arg) - { - return half(binary, arg.data_ & 0x7FFF); - } -}; -template <> -struct unary_specialized -{ - static HALF_CONSTEXPR expr negate(float arg) - { - return expr(-arg); - } - static expr fabs(float arg) - { - return expr(std::fabs(arg)); - } -}; - -/// Wrapper for binary half-precision functions needing specialization for individual argument types. -/// \tparam T first argument type -/// \tparam U first argument type -template -struct binary_specialized -{ - /// Minimum implementation. - /// \param x first operand - /// \param y second operand - /// \return minimum value - static expr fmin(float x, float y) - { -#if HALF_ENABLE_CPP11_CMATH - return expr(std::fmin(x, y)); -#else - if (builtin_isnan(x)) - return expr(y); - if (builtin_isnan(y)) - return expr(x); - return expr(std::min(x, y)); -#endif - } - - /// Maximum implementation. - /// \param x first operand - /// \param y second operand - /// \return maximum value - static expr fmax(float x, float y) - { -#if HALF_ENABLE_CPP11_CMATH - return expr(std::fmax(x, y)); -#else - if (builtin_isnan(x)) - return expr(y); - if (builtin_isnan(y)) - return expr(x); - return expr(std::max(x, y)); -#endif - } -}; -template <> -struct binary_specialized -{ - static half fmin(half x, half y) - { - int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; - if (xabs > 0x7C00) - return y; - if (yabs > 0x7C00) - return x; - return (((xabs == x.data_) ? xabs : -xabs) > ((yabs == y.data_) ? yabs : -yabs)) ? y : x; - } - static half fmax(half x, half y) - { - int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; - if (xabs > 0x7C00) - return y; - if (yabs > 0x7C00) - return x; - return (((xabs == x.data_) ? xabs : -xabs) < ((yabs == y.data_) ? yabs : -yabs)) ? y : x; - } -}; - -/// Helper class for half casts. -/// This class template has to be specialized for all valid cast argument to define an appropriate static `cast` member -/// function and a corresponding `type` member denoting its return type. 
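The half/half specializations above make fmin()/fmax() ignore a NaN operand, matching the std::fmin/std::fmax contract, by exploiting the fact that NaN payloads compare above 0x7C00. A sketch (assuming "half.h"; nanh() is the quiet-NaN factory defined later in the header):

    #include <iostream>
    #include "half.h"

    int main()
    {
        using half_float::half;
        half q = half_float::nanh("");  // quiet NaN; the payload argument is ignored
        half v(1.5f);
        std::cout << float(fmin(q, v)) << ' '   // 1.5
                  << float(fmax(v, q)) << '\n'; // 1.5
        return 0;
    }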
-/// \tparam T destination type -/// \tparam U source type -/// \tparam R rounding mode to use -template -struct half_caster -{ -}; -template -struct half_caster -{ -#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS - static_assert(std::is_arithmetic::value, "half_cast from non-arithmetic type unsupported"); -#endif - - static half cast(U arg) - { - return cast_impl(arg, is_float()); - }; - -private: - static half cast_impl(U arg, true_type) - { - return half(binary, float2half(arg)); - } - static half cast_impl(U arg, false_type) - { - return half(binary, int2half(arg)); - } -}; -template -struct half_caster -{ -#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS - static_assert(std::is_arithmetic::value, "half_cast to non-arithmetic type unsupported"); -#endif - - static T cast(half arg) - { - return cast_impl(arg, is_float()); - } - -private: - static T cast_impl(half arg, true_type) - { - return half2float(arg.data_); - } - static T cast_impl(half arg, false_type) - { - return half2int(arg.data_); - } -}; -template -struct half_caster -{ -#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS - static_assert(std::is_arithmetic::value, "half_cast to non-arithmetic type unsupported"); -#endif - - static T cast(expr arg) - { - return cast_impl(arg, is_float()); - } - -private: - static T cast_impl(float arg, true_type) - { - return static_cast(arg); - } - static T cast_impl(half arg, false_type) - { - return half2int(arg.data_); - } -}; -template -struct half_caster -{ - static half cast(half arg) - { - return arg; - } -}; -template -struct half_caster : half_caster -{ -}; - -/// \name Comparison operators -/// \{ - -/// Comparison for equality. -/// \param x first operand -/// \param y second operand -/// \retval true if operands equal -/// \retval false else -template -typename enable::type operator==(T x, U y) -{ - return functions::isequal(x, y); -} - -/// Comparison for inequality. -/// \param x first operand -/// \param y second operand -/// \retval true if operands not equal -/// \retval false else -template -typename enable::type operator!=(T x, U y) -{ - return functions::isnotequal(x, y); -} - -/// Comparison for less than. -/// \param x first operand -/// \param y second operand -/// \retval true if \a x less than \a y -/// \retval false else -template -typename enable::type operator<(T x, U y) -{ - return functions::isless(x, y); -} - -/// Comparison for greater than. -/// \param x first operand -/// \param y second operand -/// \retval true if \a x greater than \a y -/// \retval false else -template -typename enable::type operator>(T x, U y) -{ - return functions::isgreater(x, y); -} - -/// Comparison for less equal. -/// \param x first operand -/// \param y second operand -/// \retval true if \a x less equal \a y -/// \retval false else -template -typename enable::type operator<=(T x, U y) -{ - return functions::islessequal(x, y); -} - -/// Comparison for greater equal. -/// \param x first operand -/// \param y second operand -/// \retval true if \a x greater equal \a y -/// \retval false else -template -typename enable::type operator>=(T x, U y) -{ - return functions::isgreaterequal(x, y); -} - -/// \} -/// \name Arithmetic operators -/// \{ - -/// Add halfs. -/// \param x left operand -/// \param y right operand -/// \return sum of half expressions -template -typename enable::type operator+(T x, U y) -{ - return functions::plus(x, y); -} - -/// Subtract halfs. 
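The half_caster machinery above backs the half_cast() helper defined later in the header; unlike the implicit float path it converts in a single step and lets the caller pick the rounding mode per call (defaulting to HALF_ROUND_STYLE). A minimal sketch, assuming "half.h":

    #include <iostream>
    #include "half.h"

    int main()
    {
        using half_float::half;
        half h = half_float::half_cast<half>(0.1);                      // directly from double
        int  i = half_float::half_cast<int, std::round_to_nearest>(h);  // 0
        std::cout << float(h) << ' ' << i << '\n';
        return 0;
    }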
-/// \param x left operand -/// \param y right operand -/// \return difference of half expressions -template -typename enable::type operator-(T x, U y) -{ - return functions::minus(x, y); -} - -/// Multiply halfs. -/// \param x left operand -/// \param y right operand -/// \return product of half expressions -template -typename enable::type operator*(T x, U y) -{ - return functions::multiplies(x, y); -} - -/// Divide halfs. -/// \param x left operand -/// \param y right operand -/// \return quotient of half expressions -template -typename enable::type operator/(T x, U y) -{ - return functions::divides(x, y); -} - -/// Identity. -/// \param arg operand -/// \return uncahnged operand -template -HALF_CONSTEXPR typename enable::type operator+(T arg) -{ - return arg; -} - -/// Negation. -/// \param arg operand -/// \return negated operand -template -HALF_CONSTEXPR typename enable::type operator-(T arg) -{ - return unary_specialized::negate(arg); -} - -/// \} -/// \name Input and output -/// \{ - -/// Output operator. -/// \param out output stream to write into -/// \param arg half expression to write -/// \return reference to output stream -template -typename enable&, T>::type operator<<(std::basic_ostream& out, T arg) -{ - return functions::write(out, arg); -} - -/// Input operator. -/// \param in input stream to read from -/// \param arg half to read into -/// \return reference to input stream -template -std::basic_istream& operator>>(std::basic_istream& in, half& arg) -{ - return functions::read(in, arg); -} - -/// \} -/// \name Basic mathematical operations -/// \{ - -/// Absolute value. -/// \param arg operand -/// \return absolute value of \a arg -// template typename enable::type abs(T arg) { return unary_specialized::fabs(arg); } -inline half abs(half arg) -{ - return unary_specialized::fabs(arg); -} -inline expr abs(expr arg) -{ - return unary_specialized::fabs(arg); -} - -/// Absolute value. -/// \param arg operand -/// \return absolute value of \a arg -// template typename enable::type fabs(T arg) { return unary_specialized::fabs(arg); } -inline half fabs(half arg) -{ - return unary_specialized::fabs(arg); -} -inline expr fabs(expr arg) -{ - return unary_specialized::fabs(arg); -} - -/// Remainder of division. -/// \param x first operand -/// \param y second operand -/// \return remainder of floating point division. -// template typename enable::type fmod(T x, U y) { return functions::fmod(x, y); } -inline expr fmod(half x, half y) -{ - return functions::fmod(x, y); -} -inline expr fmod(half x, expr y) -{ - return functions::fmod(x, y); -} -inline expr fmod(expr x, half y) -{ - return functions::fmod(x, y); -} -inline expr fmod(expr x, expr y) -{ - return functions::fmod(x, y); -} - -/// Remainder of division. -/// \param x first operand -/// \param y second operand -/// \return remainder of floating point division. -// template typename enable::type remainder(T x, U y) { return -// functions::remainder(x, y); } -inline expr remainder(half x, half y) -{ - return functions::remainder(x, y); -} -inline expr remainder(half x, expr y) -{ - return functions::remainder(x, y); -} -inline expr remainder(expr x, half y) -{ - return functions::remainder(x, y); -} -inline expr remainder(expr x, expr y) -{ - return functions::remainder(x, y); -} - -/// Remainder of division. -/// \param x first operand -/// \param y second operand -/// \param quo address to store some bits of quotient at -/// \return remainder of floating point division. 
-// template typename enable::type remquo(T x, U y, int *quo) { return -// functions::remquo(x, y, quo); } -inline expr remquo(half x, half y, int* quo) -{ - return functions::remquo(x, y, quo); -} -inline expr remquo(half x, expr y, int* quo) -{ - return functions::remquo(x, y, quo); -} -inline expr remquo(expr x, half y, int* quo) -{ - return functions::remquo(x, y, quo); -} -inline expr remquo(expr x, expr y, int* quo) -{ - return functions::remquo(x, y, quo); -} - -/// Fused multiply add. -/// \param x first operand -/// \param y second operand -/// \param z third operand -/// \return ( \a x * \a y ) + \a z rounded as one operation. -// template typename enable::type fma(T x, U y, V z) { return -// functions::fma(x, y, z); } -inline expr fma(half x, half y, half z) -{ - return functions::fma(x, y, z); -} -inline expr fma(half x, half y, expr z) -{ - return functions::fma(x, y, z); -} -inline expr fma(half x, expr y, half z) -{ - return functions::fma(x, y, z); -} -inline expr fma(half x, expr y, expr z) -{ - return functions::fma(x, y, z); -} -inline expr fma(expr x, half y, half z) -{ - return functions::fma(x, y, z); -} -inline expr fma(expr x, half y, expr z) -{ - return functions::fma(x, y, z); -} -inline expr fma(expr x, expr y, half z) -{ - return functions::fma(x, y, z); -} -inline expr fma(expr x, expr y, expr z) -{ - return functions::fma(x, y, z); -} - -/// Maximum of half expressions. -/// \param x first operand -/// \param y second operand -/// \return maximum of operands -// template typename result::type fmax(T x, U y) { return -// binary_specialized::fmax(x, y); } -inline half fmax(half x, half y) -{ - return binary_specialized::fmax(x, y); -} -inline expr fmax(half x, expr y) -{ - return binary_specialized::fmax(x, y); -} -inline expr fmax(expr x, half y) -{ - return binary_specialized::fmax(x, y); -} -inline expr fmax(expr x, expr y) -{ - return binary_specialized::fmax(x, y); -} - -/// Minimum of half expressions. -/// \param x first operand -/// \param y second operand -/// \return minimum of operands -// template typename result::type fmin(T x, U y) { return -// binary_specialized::fmin(x, y); } -inline half fmin(half x, half y) -{ - return binary_specialized::fmin(x, y); -} -inline expr fmin(half x, expr y) -{ - return binary_specialized::fmin(x, y); -} -inline expr fmin(expr x, half y) -{ - return binary_specialized::fmin(x, y); -} -inline expr fmin(expr x, expr y) -{ - return binary_specialized::fmin(x, y); -} - -/// Positive difference. -/// \param x first operand -/// \param y second operand -/// \return \a x - \a y or 0 if difference negative -// template typename enable::type fdim(T x, U y) { return functions::fdim(x, y); } -inline expr fdim(half x, half y) -{ - return functions::fdim(x, y); -} -inline expr fdim(half x, expr y) -{ - return functions::fdim(x, y); -} -inline expr fdim(expr x, half y) -{ - return functions::fdim(x, y); -} -inline expr fdim(expr x, expr y) -{ - return functions::fdim(x, y); -} - -/// Get NaN value. -/// \return quiet NaN -inline half nanh(const char*) -{ - return functions::nanh(); -} - -/// \} -/// \name Exponential functions -/// \{ - -/// Exponential function. -/// \param arg function argument -/// \return e raised to \a arg -// template typename enable::type exp(T arg) { return functions::exp(arg); } -inline expr exp(half arg) -{ - return functions::exp(arg); -} -inline expr exp(expr arg) -{ - return functions::exp(arg); -} - -/// Exponential minus one. 
-/// \param arg function argument -/// \return e raised to \a arg subtracted by 1 -// template typename enable::type expm1(T arg) { return functions::expm1(arg); } -inline expr expm1(half arg) -{ - return functions::expm1(arg); -} -inline expr expm1(expr arg) -{ - return functions::expm1(arg); -} - -/// Binary exponential. -/// \param arg function argument -/// \return 2 raised to \a arg -// template typename enable::type exp2(T arg) { return functions::exp2(arg); } -inline expr exp2(half arg) -{ - return functions::exp2(arg); -} -inline expr exp2(expr arg) -{ - return functions::exp2(arg); -} - -/// Natural logorithm. -/// \param arg function argument -/// \return logarithm of \a arg to base e -// template typename enable::type log(T arg) { return functions::log(arg); } -inline expr log(half arg) -{ - return functions::log(arg); -} -inline expr log(expr arg) -{ - return functions::log(arg); -} - -/// Common logorithm. -/// \param arg function argument -/// \return logarithm of \a arg to base 10 -// template typename enable::type log10(T arg) { return functions::log10(arg); } -inline expr log10(half arg) -{ - return functions::log10(arg); -} -inline expr log10(expr arg) -{ - return functions::log10(arg); -} - -/// Natural logorithm. -/// \param arg function argument -/// \return logarithm of \a arg plus 1 to base e -// template typename enable::type log1p(T arg) { return functions::log1p(arg); } -inline expr log1p(half arg) -{ - return functions::log1p(arg); -} -inline expr log1p(expr arg) -{ - return functions::log1p(arg); -} - -/// Binary logorithm. -/// \param arg function argument -/// \return logarithm of \a arg to base 2 -// template typename enable::type log2(T arg) { return functions::log2(arg); } -inline expr log2(half arg) -{ - return functions::log2(arg); -} -inline expr log2(expr arg) -{ - return functions::log2(arg); -} - -/// \} -/// \name Power functions -/// \{ - -/// Square root. -/// \param arg function argument -/// \return square root of \a arg -// template typename enable::type sqrt(T arg) { return functions::sqrt(arg); } -inline expr sqrt(half arg) -{ - return functions::sqrt(arg); -} -inline expr sqrt(expr arg) -{ - return functions::sqrt(arg); -} - -/// Cubic root. -/// \param arg function argument -/// \return cubic root of \a arg -// template typename enable::type cbrt(T arg) { return functions::cbrt(arg); } -inline expr cbrt(half arg) -{ - return functions::cbrt(arg); -} -inline expr cbrt(expr arg) -{ - return functions::cbrt(arg); -} - -/// Hypotenuse function. -/// \param x first argument -/// \param y second argument -/// \return square root of sum of squares without internal over- or underflows -// template typename enable::type hypot(T x, U y) { return functions::hypot(x, y); -//} -inline expr hypot(half x, half y) -{ - return functions::hypot(x, y); -} -inline expr hypot(half x, expr y) -{ - return functions::hypot(x, y); -} -inline expr hypot(expr x, half y) -{ - return functions::hypot(x, y); -} -inline expr hypot(expr x, expr y) -{ - return functions::hypot(x, y); -} - -/// Power function. 
-/// \param base first argument -/// \param exp second argument -/// \return \a base raised to \a exp -// template typename enable::type pow(T base, U exp) { return functions::pow(base, -// exp); } -inline expr pow(half base, half exp) -{ - return functions::pow(base, exp); -} -inline expr pow(half base, expr exp) -{ - return functions::pow(base, exp); -} -inline expr pow(expr base, half exp) -{ - return functions::pow(base, exp); -} -inline expr pow(expr base, expr exp) -{ - return functions::pow(base, exp); -} - -/// \} -/// \name Trigonometric functions -/// \{ - -/// Sine function. -/// \param arg function argument -/// \return sine value of \a arg -// template typename enable::type sin(T arg) { return functions::sin(arg); } -inline expr sin(half arg) -{ - return functions::sin(arg); -} -inline expr sin(expr arg) -{ - return functions::sin(arg); -} - -/// Cosine function. -/// \param arg function argument -/// \return cosine value of \a arg -// template typename enable::type cos(T arg) { return functions::cos(arg); } -inline expr cos(half arg) -{ - return functions::cos(arg); -} -inline expr cos(expr arg) -{ - return functions::cos(arg); -} - -/// Tangent function. -/// \param arg function argument -/// \return tangent value of \a arg -// template typename enable::type tan(T arg) { return functions::tan(arg); } -inline expr tan(half arg) -{ - return functions::tan(arg); -} -inline expr tan(expr arg) -{ - return functions::tan(arg); -} - -/// Arc sine. -/// \param arg function argument -/// \return arc sine value of \a arg -// template typename enable::type asin(T arg) { return functions::asin(arg); } -inline expr asin(half arg) -{ - return functions::asin(arg); -} -inline expr asin(expr arg) -{ - return functions::asin(arg); -} - -/// Arc cosine function. -/// \param arg function argument -/// \return arc cosine value of \a arg -// template typename enable::type acos(T arg) { return functions::acos(arg); } -inline expr acos(half arg) -{ - return functions::acos(arg); -} -inline expr acos(expr arg) -{ - return functions::acos(arg); -} - -/// Arc tangent function. -/// \param arg function argument -/// \return arc tangent value of \a arg -// template typename enable::type atan(T arg) { return functions::atan(arg); } -inline expr atan(half arg) -{ - return functions::atan(arg); -} -inline expr atan(expr arg) -{ - return functions::atan(arg); -} - -/// Arc tangent function. -/// \param x first argument -/// \param y second argument -/// \return arc tangent value -// template typename enable::type atan2(T x, U y) { return functions::atan2(x, y); -//} -inline expr atan2(half x, half y) -{ - return functions::atan2(x, y); -} -inline expr atan2(half x, expr y) -{ - return functions::atan2(x, y); -} -inline expr atan2(expr x, half y) -{ - return functions::atan2(x, y); -} -inline expr atan2(expr x, expr y) -{ - return functions::atan2(x, y); -} - -/// \} -/// \name Hyperbolic functions -/// \{ - -/// Hyperbolic sine. -/// \param arg function argument -/// \return hyperbolic sine value of \a arg -// template typename enable::type sinh(T arg) { return functions::sinh(arg); } -inline expr sinh(half arg) -{ - return functions::sinh(arg); -} -inline expr sinh(expr arg) -{ - return functions::sinh(arg); -} - -/// Hyperbolic cosine. 
-/// \param arg function argument -/// \return hyperbolic cosine value of \a arg -// template typename enable::type cosh(T arg) { return functions::cosh(arg); } -inline expr cosh(half arg) -{ - return functions::cosh(arg); -} -inline expr cosh(expr arg) -{ - return functions::cosh(arg); -} - -/// Hyperbolic tangent. -/// \param arg function argument -/// \return hyperbolic tangent value of \a arg -// template typename enable::type tanh(T arg) { return functions::tanh(arg); } -inline expr tanh(half arg) -{ - return functions::tanh(arg); -} -inline expr tanh(expr arg) -{ - return functions::tanh(arg); -} - -/// Hyperbolic area sine. -/// \param arg function argument -/// \return area sine value of \a arg -// template typename enable::type asinh(T arg) { return functions::asinh(arg); } -inline expr asinh(half arg) -{ - return functions::asinh(arg); -} -inline expr asinh(expr arg) -{ - return functions::asinh(arg); -} - -/// Hyperbolic area cosine. -/// \param arg function argument -/// \return area cosine value of \a arg -// template typename enable::type acosh(T arg) { return functions::acosh(arg); } -inline expr acosh(half arg) -{ - return functions::acosh(arg); -} -inline expr acosh(expr arg) -{ - return functions::acosh(arg); -} - -/// Hyperbolic area tangent. -/// \param arg function argument -/// \return area tangent value of \a arg -// template typename enable::type atanh(T arg) { return functions::atanh(arg); } -inline expr atanh(half arg) -{ - return functions::atanh(arg); -} -inline expr atanh(expr arg) -{ - return functions::atanh(arg); -} - -/// \} -/// \name Error and gamma functions -/// \{ - -/// Error function. -/// \param arg function argument -/// \return error function value of \a arg -// template typename enable::type erf(T arg) { return functions::erf(arg); } -inline expr erf(half arg) -{ - return functions::erf(arg); -} -inline expr erf(expr arg) -{ - return functions::erf(arg); -} - -/// Complementary error function. -/// \param arg function argument -/// \return 1 minus error function value of \a arg -// template typename enable::type erfc(T arg) { return functions::erfc(arg); } -inline expr erfc(half arg) -{ - return functions::erfc(arg); -} -inline expr erfc(expr arg) -{ - return functions::erfc(arg); -} - -/// Natural logarithm of gamma function. -/// \param arg function argument -/// \return natural logarith of gamma function for \a arg -// template typename enable::type lgamma(T arg) { return functions::lgamma(arg); } -inline expr lgamma(half arg) -{ - return functions::lgamma(arg); -} -inline expr lgamma(expr arg) -{ - return functions::lgamma(arg); -} - -/// Gamma function. -/// \param arg function argument -/// \return gamma function value of \a arg -// template typename enable::type tgamma(T arg) { return functions::tgamma(arg); } -inline expr tgamma(half arg) -{ - return functions::tgamma(arg); -} -inline expr tgamma(expr arg) -{ - return functions::tgamma(arg); -} - -/// \} -/// \name Rounding -/// \{ - -/// Nearest integer not less than half value. -/// \param arg half to round -/// \return nearest integer not less than \a arg -// template typename enable::type ceil(T arg) { return functions::ceil(arg); } -inline half ceil(half arg) -{ - return functions::ceil(arg); -} -inline half ceil(expr arg) -{ - return functions::ceil(arg); -} - -/// Nearest integer not greater than half value. 
-/// \param arg half to round -/// \return nearest integer not greater than \a arg -// template typename enable::type floor(T arg) { return functions::floor(arg); } -inline half floor(half arg) -{ - return functions::floor(arg); -} -inline half floor(expr arg) -{ - return functions::floor(arg); -} - -/// Nearest integer not greater in magnitude than half value. -/// \param arg half to round -/// \return nearest integer not greater in magnitude than \a arg -// template typename enable::type trunc(T arg) { return functions::trunc(arg); } -inline half trunc(half arg) -{ - return functions::trunc(arg); -} -inline half trunc(expr arg) -{ - return functions::trunc(arg); -} - -/// Nearest integer. -/// \param arg half to round -/// \return nearest integer, rounded away from zero in half-way cases -// template typename enable::type round(T arg) { return functions::round(arg); } -inline half round(half arg) -{ - return functions::round(arg); -} -inline half round(expr arg) -{ - return functions::round(arg); -} - -/// Nearest integer. -/// \param arg half to round -/// \return nearest integer, rounded away from zero in half-way cases -// template typename enable::type lround(T arg) { return functions::lround(arg); } -inline long lround(half arg) -{ - return functions::lround(arg); -} -inline long lround(expr arg) -{ - return functions::lround(arg); -} - -/// Nearest integer using half's internal rounding mode. -/// \param arg half expression to round -/// \return nearest integer using default rounding mode -// template typename enable::type nearbyint(T arg) { return functions::nearbyint(arg); } -inline half nearbyint(half arg) -{ - return functions::rint(arg); -} -inline half nearbyint(expr arg) -{ - return functions::rint(arg); -} - -/// Nearest integer using half's internal rounding mode. -/// \param arg half expression to round -/// \return nearest integer using default rounding mode -// template typename enable::type rint(T arg) { return functions::rint(arg); } -inline half rint(half arg) -{ - return functions::rint(arg); -} -inline half rint(expr arg) -{ - return functions::rint(arg); -} - -/// Nearest integer using half's internal rounding mode. -/// \param arg half expression to round -/// \return nearest integer using default rounding mode -// template typename enable::type lrint(T arg) { return functions::lrint(arg); } -inline long lrint(half arg) -{ - return functions::lrint(arg); -} -inline long lrint(expr arg) -{ - return functions::lrint(arg); -} -#if HALF_ENABLE_CPP11_LONG_LONG -/// Nearest integer. -/// \param arg half to round -/// \return nearest integer, rounded away from zero in half-way cases -// template typename enable::type llround(T arg) { return functions::llround(arg); } -inline long long llround(half arg) -{ - return functions::llround(arg); -} -inline long long llround(expr arg) -{ - return functions::llround(arg); -} - -/// Nearest integer using half's internal rounding mode. -/// \param arg half expression to round -/// \return nearest integer using default rounding mode -// template typename enable::type llrint(T arg) { return functions::llrint(arg); } -inline long long llrint(half arg) -{ - return functions::llrint(arg); -} -inline long long llrint(expr arg) -{ - return functions::llrint(arg); -} -#endif - -/// \} -/// \name Floating point manipulation -/// \{ - -/// Decompress floating point number. 
-/// \param arg number to decompress
-/// \param exp address to store exponent at
-/// \return significand in range [0.5, 1)
-// template<typename T> typename enable<half, T>::type frexp(T arg, int* exp) { return functions::frexp(arg, exp); }
-inline half frexp(half arg, int* exp)
-{
-    return functions::frexp(arg, exp);
-}
-inline half frexp(expr arg, int* exp)
-{
-    return functions::frexp(arg, exp);
-}
-
-/// Multiply by power of two.
-/// \param arg number to modify
-/// \param exp power of two to multiply with
-/// \return \a arg multiplied by 2 raised to \a exp
-// template<typename T> typename enable<half, T>::type ldexp(T arg, int exp) { return functions::scalbln(arg, exp); }
-inline half ldexp(half arg, int exp)
-{
-    return functions::scalbln(arg, exp);
-}
-inline half ldexp(expr arg, int exp)
-{
-    return functions::scalbln(arg, exp);
-}
-
-/// Extract integer and fractional parts.
-/// \param arg number to decompress
-/// \param iptr address to store integer part at
-/// \return fractional part
-// template<typename T> typename enable<half, T>::type modf(T arg, half* iptr) { return functions::modf(arg, iptr); }
-inline half modf(half arg, half* iptr)
-{
-    return functions::modf(arg, iptr);
-}
-inline half modf(expr arg, half* iptr)
-{
-    return functions::modf(arg, iptr);
-}
-
-/// Multiply by power of two.
-/// \param arg number to modify
-/// \param exp power of two to multiply with
-/// \return \a arg multiplied by 2 raised to \a exp
-// template<typename T> typename enable<half, T>::type scalbn(T arg, int exp) { return functions::scalbln(arg, exp); }
-inline half scalbn(half arg, int exp)
-{
-    return functions::scalbln(arg, exp);
-}
-inline half scalbn(expr arg, int exp)
-{
-    return functions::scalbln(arg, exp);
-}
-
-/// Multiply by power of two.
-/// \param arg number to modify
-/// \param exp power of two to multiply with
-/// \return \a arg multiplied by 2 raised to \a exp
-// template<typename T> typename enable<half, T>::type scalbln(T arg, long exp) { return functions::scalbln(arg, exp); }
-inline half scalbln(half arg, long exp)
-{
-    return functions::scalbln(arg, exp);
-}
-inline half scalbln(expr arg, long exp)
-{
-    return functions::scalbln(arg, exp);
-}
-
-/// Extract exponent.
-/// \param arg number to query
-/// \return floating point exponent
-/// \retval FP_ILOGB0 for zero
-/// \retval FP_ILOGBNAN for NaN
-/// \retval MAX_INT for infinity
-// template<typename T> typename enable<int, T>::type ilogb(T arg) { return functions::ilogb(arg); }
-inline int ilogb(half arg)
-{
-    return functions::ilogb(arg);
-}
-inline int ilogb(expr arg)
-{
-    return functions::ilogb(arg);
-}
-
-/// Extract exponent.
-/// \param arg number to query
-/// \return floating point exponent
-// template<typename T> typename enable<half, T>::type logb(T arg) { return functions::logb(arg); }
-inline half logb(half arg)
-{
-    return functions::logb(arg);
-}
-inline half logb(expr arg)
-{
-    return functions::logb(arg);
-}
-
-/// Next representable value.
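A quick round-trip sketch for frexp()/ldexp() above: frexp() splits a half into a significand in [0.5, 1) and a power of two, and ldexp() reverses the decomposition.

#include <iostream>
#include "half.h"

using half_float::half;

int main()
{
    half h(6.5f);
    int e = 0;
    half sig = half_float::frexp(h, &e);  // 6.5 == 0.8125 * 2^3
    half back = half_float::ldexp(sig, e); // back == h
    std::cout << float(sig) << " * 2^" << e << " == " << float(back) << "\n";
    return 0;
}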
-/// \param from value to compute next representable value for -/// \param to direction towards which to compute next value -/// \return next representable value after \a from in direction towards \a to -// template typename enable::type nextafter(T from, U to) { return -// functions::nextafter(from, to); } -inline half nextafter(half from, half to) -{ - return functions::nextafter(from, to); -} -inline half nextafter(half from, expr to) -{ - return functions::nextafter(from, to); -} -inline half nextafter(expr from, half to) -{ - return functions::nextafter(from, to); -} -inline half nextafter(expr from, expr to) -{ - return functions::nextafter(from, to); -} - -/// Next representable value. -/// \param from value to compute next representable value for -/// \param to direction towards which to compute next value -/// \return next representable value after \a from in direction towards \a to -// template typename enable::type nexttoward(T from, long double to) { return -// functions::nexttoward(from, to); } -inline half nexttoward(half from, long double to) -{ - return functions::nexttoward(from, to); -} -inline half nexttoward(expr from, long double to) -{ - return functions::nexttoward(from, to); -} - -/// Take sign. -/// \param x value to change sign for -/// \param y value to take sign from -/// \return value equal to \a x in magnitude and to \a y in sign -// template typename enable::type copysign(T x, U y) { return -// functions::copysign(x, y); } -inline half copysign(half x, half y) -{ - return functions::copysign(x, y); -} -inline half copysign(half x, expr y) -{ - return functions::copysign(x, y); -} -inline half copysign(expr x, half y) -{ - return functions::copysign(x, y); -} -inline half copysign(expr x, expr y) -{ - return functions::copysign(x, y); -} - -/// \} -/// \name Floating point classification -/// \{ - -/// Classify floating point value. -/// \param arg number to classify -/// \retval FP_ZERO for positive and negative zero -/// \retval FP_SUBNORMAL for subnormal numbers -/// \retval FP_INFINITY for positive and negative infinity -/// \retval FP_NAN for NaNs -/// \retval FP_NORMAL for all other (normal) values -// template typename enable::type fpclassify(T arg) { return functions::fpclassify(arg); } -inline int fpclassify(half arg) -{ - return functions::fpclassify(arg); -} -inline int fpclassify(expr arg) -{ - return functions::fpclassify(arg); -} - -/// Check if finite number. -/// \param arg number to check -/// \retval true if neither infinity nor NaN -/// \retval false else -// template typename enable::type isfinite(T arg) { return functions::isfinite(arg); } -inline bool isfinite(half arg) -{ - return functions::isfinite(arg); -} -inline bool isfinite(expr arg) -{ - return functions::isfinite(arg); -} - -/// Check for infinity. -/// \param arg number to check -/// \retval true for positive or negative infinity -/// \retval false else -// template typename enable::type isinf(T arg) { return functions::isinf(arg); } -inline bool isinf(half arg) -{ - return functions::isinf(arg); -} -inline bool isinf(expr arg) -{ - return functions::isinf(arg); -} - -/// Check for NaN. -/// \param arg number to check -/// \retval true for NaNs -/// \retval false else -// template typename enable::type isnan(T arg) { return functions::isnan(arg); } -inline bool isnan(half arg) -{ - return functions::isnan(arg); -} -inline bool isnan(expr arg) -{ - return functions::isnan(arg); -} - -/// Check if normal number. 
-/// \param arg number to check
-/// \retval true if normal number
-/// \retval false if either subnormal, zero, infinity or NaN
-// template<typename T> typename enable<bool, T>::type isnormal(T arg) { return functions::isnormal(arg); }
-inline bool isnormal(half arg)
-{
-    return functions::isnormal(arg);
-}
-inline bool isnormal(expr arg)
-{
-    return functions::isnormal(arg);
-}
-
-/// Check sign.
-/// \param arg number to check
-/// \retval true for negative number
-/// \retval false for positive number
-// template<typename T> typename enable<bool, T>::type signbit(T arg) { return functions::signbit(arg); }
-inline bool signbit(half arg)
-{
-    return functions::signbit(arg);
-}
-inline bool signbit(expr arg)
-{
-    return functions::signbit(arg);
-}
-
-/// \}
-/// \name Comparison
-/// \{
-
-/// Comparison for greater than.
-/// \param x first operand
-/// \param y second operand
-/// \retval true if \a x greater than \a y
-/// \retval false else
-// template<typename T, typename U> typename enable<bool, T, U>::type isgreater(T x, U y) { return functions::isgreater(x, y); }
-inline bool isgreater(half x, half y)
-{
-    return functions::isgreater(x, y);
-}
-inline bool isgreater(half x, expr y)
-{
-    return functions::isgreater(x, y);
-}
-inline bool isgreater(expr x, half y)
-{
-    return functions::isgreater(x, y);
-}
-inline bool isgreater(expr x, expr y)
-{
-    return functions::isgreater(x, y);
-}
-
-/// Comparison for greater equal.
-/// \param x first operand
-/// \param y second operand
-/// \retval true if \a x greater equal \a y
-/// \retval false else
-// template<typename T, typename U> typename enable<bool, T, U>::type isgreaterequal(T x, U y) { return functions::isgreaterequal(x, y); }
-inline bool isgreaterequal(half x, half y)
-{
-    return functions::isgreaterequal(x, y);
-}
-inline bool isgreaterequal(half x, expr y)
-{
-    return functions::isgreaterequal(x, y);
-}
-inline bool isgreaterequal(expr x, half y)
-{
-    return functions::isgreaterequal(x, y);
-}
-inline bool isgreaterequal(expr x, expr y)
-{
-    return functions::isgreaterequal(x, y);
-}
-
-/// Comparison for less than.
-/// \param x first operand
-/// \param y second operand
-/// \retval true if \a x less than \a y
-/// \retval false else
-// template<typename T, typename U> typename enable<bool, T, U>::type isless(T x, U y) { return functions::isless(x, y); }
-inline bool isless(half x, half y)
-{
-    return functions::isless(x, y);
-}
-inline bool isless(half x, expr y)
-{
-    return functions::isless(x, y);
-}
-inline bool isless(expr x, half y)
-{
-    return functions::isless(x, y);
-}
-inline bool isless(expr x, expr y)
-{
-    return functions::isless(x, y);
-}
-
-/// Comparison for less equal.
-/// \param x first operand
-/// \param y second operand
-/// \retval true if \a x less equal \a y
-/// \retval false else
-// template<typename T, typename U> typename enable<bool, T, U>::type islessequal(T x, U y) { return functions::islessequal(x, y); }
-inline bool islessequal(half x, half y)
-{
-    return functions::islessequal(x, y);
-}
-inline bool islessequal(half x, expr y)
-{
-    return functions::islessequal(x, y);
-}
-inline bool islessequal(expr x, half y)
-{
-    return functions::islessequal(x, y);
-}
-inline bool islessequal(expr x, expr y)
-{
-    return functions::islessequal(x, y);
-}
-
-/// Comparison for less or greater.
-/// \param x first operand
-/// \param y second operand
-/// \retval true if either less or greater
-/// \retval false else
-// template<typename T, typename U> typename enable<bool, T, U>::type islessgreater(T x, U y) { return functions::islessgreater(x, y); }
-inline bool islessgreater(half x, half y)
-{
-    return functions::islessgreater(x, y);
-}
-inline bool islessgreater(half x, expr y)
-{
-    return functions::islessgreater(x, y);
-}
-inline bool islessgreater(expr x, half y)
-{
-    return functions::islessgreater(x, y);
-}
-inline bool islessgreater(expr x, expr y)
-{
-    return functions::islessgreater(x, y);
-}
-
-/// Check if unordered.
-/// \param x first operand
-/// \param y second operand
-/// \retval true if unordered (one or two NaN operands)
-/// \retval false else
-// template<typename T, typename U> typename enable<bool, T, U>::type isunordered(T x, U y) { return functions::isunordered(x, y); }
-inline bool isunordered(half x, half y)
-{
-    return functions::isunordered(x, y);
-}
-inline bool isunordered(half x, expr y)
-{
-    return functions::isunordered(x, y);
-}
-inline bool isunordered(expr x, half y)
-{
-    return functions::isunordered(x, y);
-}
-inline bool isunordered(expr x, expr y)
-{
-    return functions::isunordered(x, y);
-}
-
-/// \name Casting
-/// \{
-
-/// Cast to or from half-precision floating point number.
-/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted
-/// directly using the given rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do.
-/// It uses the default rounding mode.
-///
-/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types
-/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler
-/// error and casting between [half](\ref half_float::half)s is just a no-op.
-/// \tparam T destination type (half or built-in arithmetic type)
-/// \tparam U source type (half or built-in arithmetic type)
-/// \param arg value to cast
-/// \return \a arg converted to destination type
-template <typename T, typename U>
-T half_cast(U arg)
-{
-    return half_caster<T, U>::cast(arg);
-}
-
-/// Cast to or from half-precision floating point number.
-/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted
-/// directly using the given rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do.
-///
-/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types
-/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler
-/// error and casting between [half](\ref half_float::half)s is just a no-op.
-/// \tparam T destination type (half or built-in arithmetic type)
-/// \tparam R rounding mode to use.
-/// \tparam U source type (half or built-in arithmetic type)
-/// \param arg value to cast
-/// \return \a arg converted to destination type
-template <typename T, std::float_round_style R, typename U>
-T half_cast(U arg)
-{
-    return half_caster<T, U, R>::cast(arg);
-}
-/// \}
-} // namespace detail
-
-using detail::operator==;
-using detail::operator!=;
-using detail::operator<;
-using detail::operator>;
-using detail::operator<=;
-using detail::operator>=;
-using detail::operator+;
-using detail::operator-;
-using detail::operator*;
-using detail::operator/;
-using detail::operator<<;
-using detail::operator>>;
-
-using detail::abs;
-using detail::acos;
-using detail::acosh;
-using detail::asin;
-using detail::asinh;
-using detail::atan;
-using detail::atan2;
-using detail::atanh;
-using detail::cbrt;
-using detail::ceil;
-using detail::cos;
-using detail::cosh;
-using detail::erf;
-using detail::erfc;
-using detail::exp;
-using detail::exp2;
-using detail::expm1;
-using detail::fabs;
-using detail::fdim;
-using detail::floor;
-using detail::fma;
-using detail::fmax;
-using detail::fmin;
-using detail::fmod;
-using detail::hypot;
-using detail::lgamma;
-using detail::log;
-using detail::log10;
-using detail::log1p;
-using detail::log2;
-using detail::lrint;
-using detail::lround;
-using detail::nanh;
-using detail::nearbyint;
-using detail::pow;
-using detail::remainder;
-using detail::remquo;
-using detail::rint;
-using detail::round;
-using detail::sin;
-using detail::sinh;
-using detail::sqrt;
-using detail::tan;
-using detail::tanh;
-using detail::tgamma;
-using detail::trunc;
-#if HALF_ENABLE_CPP11_LONG_LONG
-using detail::llrint;
-using detail::llround;
-#endif
-using detail::copysign;
-using detail::fpclassify;
-using detail::frexp;
-using detail::ilogb;
-using detail::isfinite;
-using detail::isgreater;
-using detail::isgreaterequal;
-using detail::isinf;
-using detail::isless;
-using detail::islessequal;
-using detail::islessgreater;
-using detail::isnan;
-using detail::isnormal;
-using detail::isunordered;
-using detail::ldexp;
-using detail::logb;
-using detail::modf;
-using detail::nextafter;
-using detail::nexttoward;
-using detail::scalbln;
-using detail::scalbn;
-using detail::signbit;
-
-using detail::half_cast;
-} // namespace half_float
-
-/// Extensions to the C++ standard library.
-namespace std
-{
-/// Numeric limits for half-precision floats.
-/// Because of the underlying single-precision implementation of many operations, it inherits some properties from
-/// `std::numeric_limits<float>`.
-template <>
-class numeric_limits<half_float::half> : public numeric_limits<float>
-{
-public:
-    /// Supports signed values.
-    static HALF_CONSTEXPR_CONST bool is_signed = true;
-
-    /// Is not exact.
-    static HALF_CONSTEXPR_CONST bool is_exact = false;
-
-    /// Doesn't provide modulo arithmetic.
-    static HALF_CONSTEXPR_CONST bool is_modulo = false;
-
-    /// IEEE conformant.
-    static HALF_CONSTEXPR_CONST bool is_iec559 = true;
-
-    /// Supports infinity.
-    static HALF_CONSTEXPR_CONST bool has_infinity = true;
-
-    /// Supports quiet NaNs.
-    static HALF_CONSTEXPR_CONST bool has_quiet_NaN = true;
-
-    /// Supports subnormal values.
-    static HALF_CONSTEXPR_CONST float_denorm_style has_denorm = denorm_present;
-
-    /// Rounding mode.
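A usage sketch for the two half_cast() overloads above: the two-parameter form uses the default rounding mode, while the three-parameter form pins an explicit std::float_round_style.

#include <limits>
#include "half.h"

using half_float::half;

int main()
{
    // Direct conversion, no intermediate float round-trip
    half h = half_float::half_cast<half>(3.14159);
    int i = half_float::half_cast<int>(h); // i == 3
    // Explicit rounding mode as the second template argument
    half down = half_float::half_cast<half, std::round_toward_zero>(3.14159);
    return (i == 3 && down <= h) ? 0 : 1;
}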
-    /// Due to the mix of internal single-precision computations (using the rounding mode of the underlying
-    /// single-precision implementation) with the rounding mode of the single-to-half conversions, the actual rounding
-    /// mode might be `std::round_indeterminate` if the default half-precision rounding mode doesn't match the
-    /// single-precision rounding mode.
-    static HALF_CONSTEXPR_CONST float_round_style round_style
-        = (std::numeric_limits<float>::round_style == half_float::half::round_style) ? half_float::half::round_style
-                                                                                     : round_indeterminate;
-
-    /// Significant digits.
-    static HALF_CONSTEXPR_CONST int digits = 11;
-
-    /// Significant decimal digits.
-    static HALF_CONSTEXPR_CONST int digits10 = 3;
-
-    /// Required decimal digits to represent all possible values.
-    static HALF_CONSTEXPR_CONST int max_digits10 = 5;
-
-    /// Number base.
-    static HALF_CONSTEXPR_CONST int radix = 2;
-
-    /// One more than smallest exponent.
-    static HALF_CONSTEXPR_CONST int min_exponent = -13;
-
-    /// Smallest normalized representable power of 10.
-    static HALF_CONSTEXPR_CONST int min_exponent10 = -4;
-
-    /// One more than largest exponent.
-    static HALF_CONSTEXPR_CONST int max_exponent = 16;
-
-    /// Largest finitely representable power of 10.
-    static HALF_CONSTEXPR_CONST int max_exponent10 = 4;
-
-    /// Smallest positive normal value.
-    static HALF_CONSTEXPR half_float::half min() HALF_NOTHROW
-    {
-        return half_float::half(half_float::detail::binary, 0x0400);
-    }
-
-    /// Smallest finite value.
-    static HALF_CONSTEXPR half_float::half lowest() HALF_NOTHROW
-    {
-        return half_float::half(half_float::detail::binary, 0xFBFF);
-    }
-
-    /// Largest finite value.
-    static HALF_CONSTEXPR half_float::half max() HALF_NOTHROW
-    {
-        return half_float::half(half_float::detail::binary, 0x7BFF);
-    }
-
-    /// Difference between one and next representable value.
-    static HALF_CONSTEXPR half_float::half epsilon() HALF_NOTHROW
-    {
-        return half_float::half(half_float::detail::binary, 0x1400);
-    }
-
-    /// Maximum rounding error.
-    static HALF_CONSTEXPR half_float::half round_error() HALF_NOTHROW
-    {
-        return half_float::half(half_float::detail::binary, (round_style == std::round_to_nearest) ? 0x3800 : 0x3C00);
-    }
-
-    /// Positive infinity.
-    static HALF_CONSTEXPR half_float::half infinity() HALF_NOTHROW
-    {
-        return half_float::half(half_float::detail::binary, 0x7C00);
-    }
-
-    /// Quiet NaN.
-    static HALF_CONSTEXPR half_float::half quiet_NaN() HALF_NOTHROW
-    {
-        return half_float::half(half_float::detail::binary, 0x7FFF);
-    }
-
-    /// Signalling NaN.
-    static HALF_CONSTEXPR half_float::half signaling_NaN() HALF_NOTHROW
-    {
-        return half_float::half(half_float::detail::binary, 0x7DFF);
-    }
-
-    /// Smallest positive subnormal value.
-    static HALF_CONSTEXPR half_float::half denorm_min() HALF_NOTHROW
-    {
-        return half_float::half(half_float::detail::binary, 0x0001);
-    }
-};
-
-#if HALF_ENABLE_CPP11_HASH
-/// Hash function for half-precision floats.
-/// This is only defined if C++11 `std::hash` is supported and enabled.
-template <>
-struct hash<half_float::half> //: unary_function<half_float::half, size_t>
-{
-    /// Type of function argument.
-    typedef half_float::half argument_type;
-
-    /// Function return type.
-    typedef size_t result_type;
-
-    /// Compute hash function.
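The specialization above yields the usual IEEE 754 binary16 constants; a quick sketch that prints them (expected values in comments):

#include <iostream>
#include <limits>
#include "half.h"

int main()
{
    using lim = std::numeric_limits<half_float::half>;
    std::cout << "digits:     " << lim::digits << "\n";              // 11
    std::cout << "max:        " << float(lim::max()) << "\n";        // 65504
    std::cout << "min:        " << float(lim::min()) << "\n";        // 6.10352e-05
    std::cout << "epsilon:    " << float(lim::epsilon()) << "\n";    // 0.000976562
    std::cout << "denorm_min: " << float(lim::denorm_min()) << "\n"; // 5.96046e-08
    return 0;
}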
-    /// \param arg half to hash
-    /// \return hash value
-    result_type operator()(argument_type arg) const
-    {
-        return hash<half_float::detail::uint16>()(static_cast<unsigned int>(arg.data_) & -(arg.data_ != 0x8000));
-    }
-};
-#endif
-} // namespace std
-
-#undef HALF_CONSTEXPR
-#undef HALF_CONSTEXPR_CONST
-#undef HALF_NOEXCEPT
-#undef HALF_NOTHROW
-#ifdef HALF_POP_WARNINGS
-#pragma warning(pop)
-#undef HALF_POP_WARNINGS
-#endif
-
-#endif
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/logger.cpp b/src/Detector/tensorrt_yolo/common_deprecated/logger.cpp
deleted file mode 100644
index 03c64398..00000000
--- a/src/Detector/tensorrt_yolo/common_deprecated/logger.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "logger.h"
-#include "ErrorRecorder.h"
-#include "logging.h"
-
-SampleErrorRecorder gRecorder;
-namespace sample
-{
-Logger gLogger{Logger::Severity::kINFO};
-LogStreamConsumer gLogVerbose{LOG_VERBOSE(gLogger)};
-LogStreamConsumer gLogInfo{LOG_INFO(gLogger)};
-LogStreamConsumer gLogWarning{LOG_WARN(gLogger)};
-LogStreamConsumer gLogError{LOG_ERROR(gLogger)};
-LogStreamConsumer gLogFatal{LOG_FATAL(gLogger)};
-
-void setReportableSeverity(Logger::Severity severity)
-{
-    gLogger.setReportableSeverity(severity);
-    gLogVerbose.setReportableSeverity(severity);
-    gLogInfo.setReportableSeverity(severity);
-    gLogWarning.setReportableSeverity(severity);
-    gLogError.setReportableSeverity(severity);
-    gLogFatal.setReportableSeverity(severity);
-}
-} // namespace sample
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/logger.h b/src/Detector/tensorrt_yolo/common_deprecated/logger.h
deleted file mode 100644
index 3069e8e9..00000000
--- a/src/Detector/tensorrt_yolo/common_deprecated/logger.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#ifndef LOGGER_H -#define LOGGER_H - -#include "logging.h" - -class SampleErrorRecorder; -extern SampleErrorRecorder gRecorder; -namespace sample -{ -extern Logger gLogger; -extern LogStreamConsumer gLogVerbose; -extern LogStreamConsumer gLogInfo; -extern LogStreamConsumer gLogWarning; -extern LogStreamConsumer gLogError; -extern LogStreamConsumer gLogFatal; - -void setReportableSeverity(Logger::Severity severity); -} // namespace sample - -#endif // LOGGER_H diff --git a/src/Detector/tensorrt_yolo/common_deprecated/logging.h b/src/Detector/tensorrt_yolo/common_deprecated/logging.h deleted file mode 100644 index 78732c10..00000000 --- a/src/Detector/tensorrt_yolo/common_deprecated/logging.h +++ /dev/null @@ -1,578 +0,0 @@ -/* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef TENSORRT_LOGGING_H -#define TENSORRT_LOGGING_H - -#include "NvInferRuntimeCommon.h" -#include "sampleOptions.h" -#include -#include -#include -#include -#include -#include -#include -#include - -namespace sample -{ - -using Severity = nvinfer1::ILogger::Severity; - -class LogStreamConsumerBuffer : public std::stringbuf -{ -public: - LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) - : mOutput(stream) - , mPrefix(prefix) - , mShouldLog(shouldLog) - { - } - - LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept - : mOutput(other.mOutput) - , mPrefix(other.mPrefix) - , mShouldLog(other.mShouldLog) - { - } - LogStreamConsumerBuffer(const LogStreamConsumerBuffer& other) = delete; - LogStreamConsumerBuffer() = delete; - LogStreamConsumerBuffer& operator=(const LogStreamConsumerBuffer&) = delete; - LogStreamConsumerBuffer& operator=(LogStreamConsumerBuffer&&) = delete; - - ~LogStreamConsumerBuffer() override - { - // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence - // std::streambuf::pptr() gives a pointer to the current position of the output sequence - // if the pointer to the beginning is not equal to the pointer to the current position, - // call putOutput() to log the output to the stream - if (pbase() != pptr()) - { - putOutput(); - } - } - - //! - //! synchronizes the stream buffer and returns 0 on success - //! synchronizing the stream buffer consists of inserting the buffer contents into the stream, - //! resetting the buffer and flushing the stream - //! 
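The globals declared in logger.h above are the idiomatic way the samples emit messages: stream into the consumer that matches the severity, and filter globally with setReportableSeverity(). A sketch:

#include "logger.h"

void report(int detections)
{
    sample::setReportableSeverity(sample::Logger::Severity::kINFO);
    sample::gLogInfo << "detections: " << detections << std::endl;
    // Suppressed unless the reportable severity is kVERBOSE
    sample::gLogVerbose << "per-box details elided" << std::endl;
}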
-    int32_t sync() override
-    {
-        putOutput();
-        return 0;
-    }
-
-    void putOutput()
-    {
-        if (mShouldLog)
-        {
-            // prepend timestamp
-            std::time_t timestamp = std::time(nullptr);
-            tm* tm_local = std::localtime(&timestamp);
-            mOutput << "[";
-            mOutput << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
-            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
-            mOutput << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
-            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
-            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
-            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
-            // std::stringbuf::str() gets the string contents of the buffer
-            // insert the buffer contents prepended by the appropriate prefix into the stream
-            mOutput << mPrefix << str();
-        }
-        // set the buffer to empty
-        str("");
-        // flush the stream
-        mOutput.flush();
-    }
-
-    void setShouldLog(bool shouldLog)
-    {
-        mShouldLog = shouldLog;
-    }
-
-private:
-    std::ostream& mOutput;
-    std::string mPrefix;
-    bool mShouldLog{};
-}; // class LogStreamConsumerBuffer
-
-//!
-//! \class LogStreamConsumerBase
-//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
-//!
-class LogStreamConsumerBase
-{
-public:
-    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
-        : mBuffer(stream, prefix, shouldLog)
-    {
-    }
-
-protected:
-    std::mutex mLogMutex;
-    LogStreamConsumerBuffer mBuffer;
-}; // class LogStreamConsumerBase
-
-//!
-//! \class LogStreamConsumer
-//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
-//! Order of base classes is LogStreamConsumerBase and then std::ostream.
-//! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
-//! in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
-//! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
-//! Please do not change the order of the parent classes.
-//!
-class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
-{
-public:
-    //!
-    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
-    //! Reportable severity determines if the messages are severe enough to be logged.
-    //!
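The base-class ordering rule documented above is the standard base-before-member trick for handing std::ostream a live buffer: bases are constructed in declaration order, so the buffer holder must come first. The same pattern in isolation:

#include <iostream>
#include <sstream>

// Constructed before the std::ostream base, so &mBuf is valid
// by the time the std::ostream base constructor runs.
struct BufHolder
{
    std::stringbuf mBuf;
};

class BufferedStream : private BufHolder, public std::ostream
{
public:
    BufferedStream()
        : BufHolder()
        , std::ostream(&mBuf)
    {
    }
    std::string str() const { return mBuf.str(); }
};

int main()
{
    BufferedStream s;
    s << "hello " << 42;
    std::cout << s.str() << "\n";
    return 0;
}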
-    LogStreamConsumer(nvinfer1::ILogger::Severity reportableSeverity, nvinfer1::ILogger::Severity severity)
-        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
-        , std::ostream(&mBuffer) // links the stream buffer with the stream
-        , mShouldLog(severity <= reportableSeverity)
-        , mSeverity(severity)
-    {
-    }
-
-    LogStreamConsumer(LogStreamConsumer&& other) noexcept
-        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
-        , std::ostream(&mBuffer) // links the stream buffer with the stream
-        , mShouldLog(other.mShouldLog)
-        , mSeverity(other.mSeverity)
-    {
-    }
-    LogStreamConsumer(const LogStreamConsumer& other) = delete;
-    LogStreamConsumer() = delete;
-    ~LogStreamConsumer() = default;
-    LogStreamConsumer& operator=(const LogStreamConsumer&) = delete;
-    LogStreamConsumer& operator=(LogStreamConsumer&&) = delete;
-
-    void setReportableSeverity(Severity reportableSeverity)
-    {
-        mShouldLog = mSeverity <= reportableSeverity;
-        mBuffer.setShouldLog(mShouldLog);
-    }
-
-    std::mutex& getMutex()
-    {
-        return mLogMutex;
-    }
-
-    bool getShouldLog() const
-    {
-        return mShouldLog;
-    }
-
-private:
-    static std::ostream& severityOstream(Severity severity)
-    {
-        return severity >= Severity::kINFO ? std::cout : std::cerr;
-    }
-
-    static std::string severityPrefix(Severity severity)
-    {
-        switch (severity)
-        {
-        case Severity::kINTERNAL_ERROR: return "[F] ";
-        case Severity::kERROR: return "[E] ";
-        case Severity::kWARNING: return "[W] ";
-        case Severity::kINFO: return "[I] ";
-        case Severity::kVERBOSE: return "[V] ";
-        default: assert(0); return "";
-        }
-    }
-
-    bool mShouldLog;
-    Severity mSeverity;
-}; // class LogStreamConsumer
-
-template <typename T>
-LogStreamConsumer& operator<<(LogStreamConsumer& logger, const T& obj)
-{
-    if (logger.getShouldLog())
-    {
-        std::lock_guard<std::mutex> guard(logger.getMutex());
-        auto& os = static_cast<std::ostream&>(logger);
-        os << obj;
-    }
-    return logger;
-}
-
-//!
-//! Special handling for std::endl
-//!
-inline LogStreamConsumer& operator<<(LogStreamConsumer& logger, std::ostream& (*f)(std::ostream&) )
-{
-    if (logger.getShouldLog())
-    {
-        std::lock_guard<std::mutex> guard(logger.getMutex());
-        auto& os = static_cast<std::ostream&>(logger);
-        os << f;
-    }
-    return logger;
-}
-
-inline LogStreamConsumer& operator<<(LogStreamConsumer& logger, const nvinfer1::Dims& dims)
-{
-    if (logger.getShouldLog())
-    {
-        std::lock_guard<std::mutex> guard(logger.getMutex());
-        auto& os = static_cast<std::ostream&>(logger);
-        for (int32_t i = 0; i < dims.nbDims; ++i)
-        {
-            os << (i ? "x" : "") << dims.d[i];
-        }
-    }
-    return logger;
-}
-
-//!
-//! \class Logger
-//!
-//! \brief Class which manages logging of TensorRT tools and samples
-//!
-//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
-//! and supports logging two types of messages:
-//!
-//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
-//! - Test pass/fail messages
-//!
-//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
-//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
-//!
-//! In the future, this class could be extended to support dumping test results to a file in some standard format
-//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
-//!
-//!
TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger -//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT -//! library and messages coming from the sample. -//! -//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the -//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger -//! object. -//! -class Logger : public nvinfer1::ILogger -{ -public: - explicit Logger(Severity severity = Severity::kWARNING) - : mReportableSeverity(severity) - { - } - - //! - //! \enum TestResult - //! \brief Represents the state of a given test - //! - enum class TestResult - { - kRUNNING, //!< The test is running - kPASSED, //!< The test passed - kFAILED, //!< The test failed - kWAIVED //!< The test was waived - }; - - //! - //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger - //! \return The nvinfer1::ILogger associated with this Logger - //! - //! TODO Once all samples are updated to use this method to register the logger with TensorRT, - //! we can eliminate the inheritance of Logger from ILogger - //! - nvinfer1::ILogger& getTRTLogger() noexcept - { - return *this; - } - - //! - //! \brief Implementation of the nvinfer1::ILogger::log() virtual method - //! - //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the - //! inheritance from nvinfer1::ILogger - //! - void log(Severity severity, const char* msg) noexcept override - { - LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; - } - - //! - //! \brief Method for controlling the verbosity of logging output - //! - //! \param severity The logger will only emit messages that have severity of this level or higher. - //! - void setReportableSeverity(Severity severity) noexcept - { - mReportableSeverity = severity; - } - - //! - //! \brief Opaque handle that holds logging information for a particular test - //! - //! This object is an opaque handle to information used by the Logger to print test results. - //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used - //! with Logger::reportTest{Start,End}(). - //! - class TestAtom - { - public: - TestAtom(TestAtom&&) = default; - - private: - friend class Logger; - - TestAtom(bool started, const std::string& name, const std::string& cmdline) - : mStarted(started) - , mName(name) - , mCmdline(cmdline) - { - } - - bool mStarted; - std::string mName; - std::string mCmdline; - }; - - //! - //! \brief Define a test for logging - //! - //! \param[in] name The name of the test. This should be a string starting with - //! "TensorRT" and containing dot-separated strings containing - //! the characters [A-Za-z0-9_]. - //! For example, "TensorRT.sample_googlenet" - //! \param[in] cmdline The command line used to reproduce the test - // - //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). - //! - static TestAtom defineTest(const std::string& name, const std::string& cmdline) - { - return TestAtom(false, name, cmdline); - } - - //! - //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments - //! as input - //! - //! \param[in] name The name of the test - //! \param[in] argc The number of command-line arguments - //! 
\param[in] argv The array of command-line arguments (given as C strings) - //! - //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). - //! - static TestAtom defineTest(const std::string& name, int32_t argc, char const* const* argv) - { - // Append TensorRT version as info - const std::string vname = name + " [TensorRT v" + std::to_string(NV_TENSORRT_VERSION) + "]"; - auto cmdline = genCmdlineString(argc, argv); - return defineTest(vname, cmdline); - } - - //! - //! \brief Report that a test has started. - //! - //! \pre reportTestStart() has not been called yet for the given testAtom - //! - //! \param[in] testAtom The handle to the test that has started - //! - static void reportTestStart(TestAtom& testAtom) - { - reportTestResult(testAtom, TestResult::kRUNNING); - assert(!testAtom.mStarted); - testAtom.mStarted = true; - } - - //! - //! \brief Report that a test has ended. - //! - //! \pre reportTestStart() has been called for the given testAtom - //! - //! \param[in] testAtom The handle to the test that has ended - //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, - //! TestResult::kFAILED, TestResult::kWAIVED - //! - static void reportTestEnd(TestAtom const& testAtom, TestResult result) - { - assert(result != TestResult::kRUNNING); - assert(testAtom.mStarted); - reportTestResult(testAtom, result); - } - - static int32_t reportPass(TestAtom const& testAtom) - { - reportTestEnd(testAtom, TestResult::kPASSED); - return EXIT_SUCCESS; - } - - static int32_t reportFail(TestAtom const& testAtom) - { - reportTestEnd(testAtom, TestResult::kFAILED); - return EXIT_FAILURE; - } - - static int32_t reportWaive(TestAtom const& testAtom) - { - reportTestEnd(testAtom, TestResult::kWAIVED); - return EXIT_SUCCESS; - } - - static int32_t reportTest(TestAtom const& testAtom, bool pass) - { - return pass ? reportPass(testAtom) : reportFail(testAtom); - } - - Severity getReportableSeverity() const - { - return mReportableSeverity; - } - -private: - //! - //! \brief returns an appropriate string for prefixing a log message with the given severity - //! - static const char* severityPrefix(Severity severity) - { - switch (severity) - { - case Severity::kINTERNAL_ERROR: return "[F] "; - case Severity::kERROR: return "[E] "; - case Severity::kWARNING: return "[W] "; - case Severity::kINFO: return "[I] "; - case Severity::kVERBOSE: return "[V] "; - default: assert(0); return ""; - } - } - - //! - //! \brief returns an appropriate string for prefixing a test result message with the given result - //! - static const char* testResultString(TestResult result) - { - switch (result) - { - case TestResult::kRUNNING: return "RUNNING"; - case TestResult::kPASSED: return "PASSED"; - case TestResult::kFAILED: return "FAILED"; - case TestResult::kWAIVED: return "WAIVED"; - default: assert(0); return ""; - } - } - - //! - //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity - //! - static std::ostream& severityOstream(Severity severity) - { - return severity >= Severity::kINFO ? std::cout : std::cerr; - } - - //! - //! \brief method that implements logging test results - //! - static void reportTestResult(TestAtom const& testAtom, TestResult result) - { - severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " - << testAtom.mCmdline << std::endl; - } - - //! - //! \brief generate a command line string from the given (argc, argv) values - //! 
- static std::string genCmdlineString(int32_t argc, char const* const* argv) - { - std::stringstream ss; - for (int32_t i = 0; i < argc; i++) - { - if (i > 0) - { - ss << " "; - } - ss << argv[i]; - } - return ss.str(); - } - - Severity mReportableSeverity; -}; // class Logger - -namespace -{ -//! -//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE -//! -//! Example usage: -//! -//! LOG_VERBOSE(logger) << "hello world" << std::endl; -//! -inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) -{ - return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); -} - -//! -//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO -//! -//! Example usage: -//! -//! LOG_INFO(logger) << "hello world" << std::endl; -//! -inline LogStreamConsumer LOG_INFO(const Logger& logger) -{ - return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); -} - -//! -//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING -//! -//! Example usage: -//! -//! LOG_WARN(logger) << "hello world" << std::endl; -//! -inline LogStreamConsumer LOG_WARN(const Logger& logger) -{ - return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); -} - -//! -//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR -//! -//! Example usage: -//! -//! LOG_ERROR(logger) << "hello world" << std::endl; -//! -inline LogStreamConsumer LOG_ERROR(const Logger& logger) -{ - return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); -} - -//! -//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR -//! ("fatal" severity) -//! -//! Example usage: -//! -//! LOG_FATAL(logger) << "hello world" << std::endl; -//! -inline LogStreamConsumer LOG_FATAL(const Logger& logger) -{ - return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); -} -} // anonymous namespace -} // namespace sample -#endif // TENSORRT_LOGGING_H diff --git a/src/Detector/tensorrt_yolo/common_deprecated/parserOnnxConfig.h b/src/Detector/tensorrt_yolo/common_deprecated/parserOnnxConfig.h deleted file mode 100644 index c92a1420..00000000 --- a/src/Detector/tensorrt_yolo/common_deprecated/parserOnnxConfig.h +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
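Putting the Logger pieces above together, a sketch of how a sample typically registers the logger with TensorRT and brackets a run with test atoms (the test name here is illustrative):

#include <memory>
#include "NvInfer.h"
#include "logger.h"

int main(int argc, char** argv)
{
    auto test = sample::Logger::defineTest("TensorRT.sample_demo", argc, argv);
    sample::Logger::reportTestStart(test);

    // Hand the wrapped nvinfer1::ILogger to TensorRT
    auto builder = std::unique_ptr<nvinfer1::IBuilder>(
        nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger()));

    const bool ok = (builder != nullptr); // ... build and run the sample here ...
    // Emits "&&&& PASSED ..." or "&&&& FAILED ..." and maps to an exit code
    return sample::Logger::reportTest(test, ok);
}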
- */ - -#ifndef PARSER_ONNX_CONFIG_H -#define PARSER_ONNX_CONFIG_H - -#include -#include -#include - -#include "NvInfer.h" -#include "NvOnnxConfig.h" -#include "NvOnnxParser.h" - -#define ONNX_DEBUG 1 - -/** - * \class ParserOnnxConfig - * \brief Configuration Manager Class Concrete Implementation - * - * \note: - * - */ - -using namespace std; - -class ParserOnnxConfig : public nvonnxparser::IOnnxConfig -{ - -protected: - string mModelFilename{}; - string mTextFilename{}; - string mFullTextFilename{}; - nvinfer1::DataType mModelDtype; - nvonnxparser::IOnnxConfig::Verbosity mVerbosity; - bool mPrintLayercInfo; - -public: - ParserOnnxConfig() - : mModelDtype(nvinfer1::DataType::kFLOAT) - , mVerbosity(static_cast(nvinfer1::ILogger::Severity::kWARNING)) - , mPrintLayercInfo(false) - { -#ifdef ONNX_DEBUG - if (isDebug()) - { - std::cout << " ParserOnnxConfig::ctor(): " << this << "\t" << std::endl; - } -#endif - } - -protected: - ~ParserOnnxConfig() - { -#ifdef ONNX_DEBUG - if (isDebug()) - { - std::cout << "ParserOnnxConfig::dtor(): " << this << std::endl; - } -#endif - } - -public: - virtual void setModelDtype(const nvinfer1::DataType modelDtype) noexcept - { - mModelDtype = modelDtype; - } - - virtual nvinfer1::DataType getModelDtype() const noexcept - { - return mModelDtype; - } - - virtual const char* getModelFileName() const noexcept - { - return mModelFilename.c_str(); - } - virtual void setModelFileName(const char* onnxFilename) noexcept - { - mModelFilename = string(onnxFilename); - } - virtual nvonnxparser::IOnnxConfig::Verbosity getVerbosityLevel() const noexcept - { - return mVerbosity; - } - virtual void addVerbosity() noexcept - { - ++mVerbosity; - } - virtual void reduceVerbosity() noexcept - { - --mVerbosity; - } - virtual void setVerbosityLevel(nvonnxparser::IOnnxConfig::Verbosity verbosity) noexcept - { - mVerbosity = verbosity; - } - - virtual const char* getTextFileName() const noexcept - { - return mTextFilename.c_str(); - } - virtual void setTextFileName(const char* textFilename) noexcept - { - mTextFilename = string(textFilename); - } - virtual const char* getFullTextFileName() const noexcept - { - return mFullTextFilename.c_str(); - } - virtual void setFullTextFileName(const char* fullTextFilename) noexcept - { - mFullTextFilename = string(fullTextFilename); - } - virtual bool getPrintLayerInfo() const noexcept - { - return mPrintLayercInfo; - } - virtual void setPrintLayerInfo(bool src) noexcept - { - mPrintLayercInfo = src; - } //!< get the boolean variable corresponding to the Layer Info, see getPrintLayerInfo() - - virtual bool isDebug() const noexcept - { -#if ONNX_DEBUG - return (std::getenv("ONNX_DEBUG") ? true : false); -#else - return false; -#endif - } - - virtual void destroy() noexcept - { - delete this; - } - -}; // class ParserOnnxConfig - -#endif diff --git a/src/Detector/tensorrt_yolo/common_deprecated/safeCommon.h b/src/Detector/tensorrt_yolo/common_deprecated/safeCommon.h deleted file mode 100644 index 3d84b095..00000000 --- a/src/Detector/tensorrt_yolo/common_deprecated/safeCommon.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef TENSORRT_SAFE_COMMON_H
-#define TENSORRT_SAFE_COMMON_H
-
-#include "NvInferRuntimeCommon.h"
-#include
-#include
-#include
-#include
-#include
-
-#define CHECK(status) \
-    do \
-    { \
-        auto ret = (status); \
-        if (ret != 0) \
-        { \
-            std::cerr << "Cuda failure: " << ret << std::endl; \
-            abort(); \
-        } \
-    } while (0)
-
-namespace samplesCommon
-{
-template <typename T>
-inline std::shared_ptr<T> infer_object(T* obj)
-{
-    if (!obj)
-    {
-        throw std::runtime_error("Failed to create object");
-    }
-    return std::shared_ptr<T>(obj);
-}
-
-inline uint32_t elementSize(nvinfer1::DataType t)
-{
-    switch (t)
-    {
-    case nvinfer1::DataType::kINT32:
-    case nvinfer1::DataType::kFLOAT: return 4;
-    case nvinfer1::DataType::kHALF: return 2;
-    case nvinfer1::DataType::kINT8: return 1;
-    case nvinfer1::DataType::kBOOL: return 1;
-    }
-    return 0;
-}
-
-template <typename A, typename B>
-inline A divUp(A x, B n)
-{
-    return (x + n - 1) / n;
-}
-
-} // namespace samplesCommon
-
-#endif // TENSORRT_SAFE_COMMON_H
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleConfig.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleConfig.h
deleted file mode 100644
index 53a78331..00000000
--- a/src/Detector/tensorrt_yolo/common_deprecated/sampleConfig.h
+++ /dev/null
@@ -1,337 +0,0 @@
-/*
- * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
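The safeCommon.h helpers just shown (CHECK, elementSize, divUp) are typically combined when sizing device work. A sketch, where the element count n and the block size 256 are hypothetical:

#include <cuda_runtime_api.h>
#include "safeCommon.h"

void* allocateHalfBuffer(int n)
{
    // Bytes for n FP16 elements
    const size_t bytes = samplesCommon::elementSize(nvinfer1::DataType::kHALF) * n;
    // Whole 256-thread blocks covering n elements
    const int blocks = samplesCommon::divUp(n, 256);
    static_cast<void>(blocks); // would become the CUDA launch grid size
    void* dptr = nullptr;
    CHECK(cudaMalloc(&dptr, bytes)); // aborts with a message on failure
    return dptr;
}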
- */
-
-#ifndef SampleConfig_H
-#define SampleConfig_H
-
-#include
-#include
-#include
-
-#include "NvInfer.h"
-#include "NvOnnxConfig.h"
-class SampleConfig : public nvonnxparser::IOnnxConfig
-{
-public:
-    enum class InputDataFormat : int
-    {
-        kASCII = 0,
-        kPPM = 1
-    };
-
-private:
-    std::string mModelFilename;
-    std::string mEngineFilename;
-    std::string mTextFilename;
-    std::string mFullTextFilename;
-    std::string mImageFilename;
-    std::string mReferenceFilename;
-    std::string mOutputFilename;
-    std::string mCalibrationFilename;
-    std::string mTimingCacheFilename;
-    int64_t mLabel{-1};
-    int64_t mMaxBatchSize{32};
-    int64_t mCalibBatchSize{0};
-    int64_t mMaxNCalibBatch{0};
-    int64_t mFirstCalibBatch{0};
-    int64_t mUseDLACore{-1};
-    nvinfer1::DataType mModelDtype{nvinfer1::DataType::kFLOAT};
-    bool mTF32{true};
-    Verbosity mVerbosity{static_cast<Verbosity>(nvinfer1::ILogger::Severity::kWARNING)};
-    bool mPrintLayercInfo{false};
-    bool mDebugBuilder{false};
-    InputDataFormat mInputDataFormat{InputDataFormat::kASCII};
-    uint64_t mTopK{0};
-    float mFailurePercentage{-1.0f};
-    float mTolerance{0.0f};
-    float mAbsTolerance{1e-5f};
-
-public:
-    SampleConfig()
-    {
-#ifdef ONNX_DEBUG
-        if (isDebug())
-        {
-            std::cout << " SampleConfig::ctor(): " << this << "\t" << std::endl;
-        }
-#endif
-    }
-
-protected:
-    ~SampleConfig()
-    {
-#ifdef ONNX_DEBUG
-        if (isDebug())
-        {
-            std::cout << "SampleConfig::dtor(): " << this << std::endl;
-        }
-#endif
-    }
-
-public:
-    void setModelDtype(const nvinfer1::DataType mdt) noexcept
-    {
-        mModelDtype = mdt;
-    }
-
-    nvinfer1::DataType getModelDtype() const noexcept
-    {
-        return mModelDtype;
-    }
-
-    bool getTF32() const noexcept
-    {
-        return mTF32;
-    }
-
-    void setTF32(bool enabled) noexcept
-    {
-        mTF32 = enabled;
-    }
-
-    const char* getModelFileName() const noexcept
-    {
-        return mModelFilename.c_str();
-    }
-
-    void setModelFileName(const char* onnxFilename) noexcept
-    {
-        mModelFilename = std::string(onnxFilename);
-    }
-    Verbosity getVerbosityLevel() const noexcept
-    {
-        return mVerbosity;
-    }
-    void addVerbosity() noexcept
-    {
-        ++mVerbosity;
-    }
-    void reduceVerbosity() noexcept
-    {
-        --mVerbosity;
-    }
-    virtual void setVerbosityLevel(Verbosity v) noexcept
-    {
-        mVerbosity = v;
-    }
-    const char* getEngineFileName() const noexcept
-    {
-        return mEngineFilename.c_str();
-    }
-    void setEngineFileName(const char* engineFilename) noexcept
-    {
-        mEngineFilename = std::string(engineFilename);
-    }
-    const char* getTextFileName() const noexcept
-    {
-        return mTextFilename.c_str();
-    }
-    void setTextFileName(const char* textFilename) noexcept
-    {
-        mTextFilename = std::string(textFilename);
-    }
-    const char* getFullTextFileName() const noexcept
-    {
-        return mFullTextFilename.c_str();
-    }
-    void setFullTextFileName(const char* fullTextFilename) noexcept
-    {
-        mFullTextFilename = std::string(fullTextFilename);
-    }
-    void setLabel(int64_t label) noexcept
-    {
-        mLabel = label;
-    } //!< set the Label
-
-    int64_t getLabel() const noexcept
-    {
-        return mLabel;
-    } //!< get the Label
-
-    bool getPrintLayerInfo() const noexcept
-    {
-        return mPrintLayercInfo;
-    }
-
-    void setPrintLayerInfo(bool b) noexcept
-    {
-        mPrintLayercInfo = b;
-    } //!< set the boolean variable corresponding to the Layer Info, see getPrintLayerInfo()
-
-    void setMaxBatchSize(int64_t maxBatchSize) noexcept
-    {
-        mMaxBatchSize = maxBatchSize;
-    } //!< set the Max Batch Size
-    int64_t getMaxBatchSize() const noexcept
-    {
-        return mMaxBatchSize;
-    } //!< get the Max Batch Size
-
-    void setCalibBatchSize(int64_t CalibBatchSize)
noexcept - { - mCalibBatchSize = CalibBatchSize; - } //!< set the calibration batch size - int64_t getCalibBatchSize() const noexcept - { - return mCalibBatchSize; - } //!< get calibration batch size - - void setMaxNCalibBatch(int64_t MaxNCalibBatch) noexcept - { - mMaxNCalibBatch = MaxNCalibBatch; - } //!< set Max Number of Calibration Batches - int64_t getMaxNCalibBatch() const noexcept - { - return mMaxNCalibBatch; - } //!< get the Max Number of Calibration Batches - - void setFirstCalibBatch(int64_t FirstCalibBatch) noexcept - { - mFirstCalibBatch = FirstCalibBatch; - } //!< set the first calibration batch - int64_t getFirstCalibBatch() const noexcept - { - return mFirstCalibBatch; - } //!< get the first calibration batch - - void setUseDLACore(int64_t UseDLACore) noexcept - { - mUseDLACore = UseDLACore; - } //!< set the DLA core to use - int64_t getUseDLACore() const noexcept - { - return mUseDLACore; - } //!< get the DLA core to use - - void setDebugBuilder() noexcept - { - mDebugBuilder = true; - } //!< enable the Debug info, while building the engine. - bool getDebugBuilder() const noexcept - { - return mDebugBuilder; - } //!< get the boolean variable, corresponding to the debug builder - - const char* getImageFileName() const noexcept //!< set Image file name (PPM or ASCII) - { - return mImageFilename.c_str(); - } - void setImageFileName(const char* imageFilename) noexcept //!< get the Image file name - { - mImageFilename = std::string(imageFilename); - } - const char* getReferenceFileName() const noexcept - { - return mReferenceFilename.c_str(); - } - void setReferenceFileName(const char* referenceFilename) noexcept //!< set reference file name - { - mReferenceFilename = std::string(referenceFilename); - } - - void setInputDataFormat(InputDataFormat idt) noexcept - { - mInputDataFormat = idt; - } //!< specifies expected data format of the image file (PPM or ASCII) - InputDataFormat getInputDataFormat() const noexcept - { - return mInputDataFormat; - } //!< returns the expected data format of the image file. - - const char* getOutputFileName() const noexcept //!< specifies the file to save the results - { - return mOutputFilename.c_str(); - } - void setOutputFileName(const char* outputFilename) noexcept //!< get the output file name - { - mOutputFilename = std::string(outputFilename); - } - - const char* getCalibrationFileName() const noexcept - { - return mCalibrationFilename.c_str(); - } //!< specifies the file containing the list of image files for int8 calibration - void setCalibrationFileName(const char* calibrationFilename) noexcept //!< get the int 8 calibration list file name - { - mCalibrationFilename = std::string(calibrationFilename); - } - - uint64_t getTopK() const noexcept - { - return mTopK; - } - void setTopK(uint64_t topK) noexcept - { - mTopK = topK; - } //!< If this options is specified, return the K top probabilities. 
-
-    float getFailurePercentage() const noexcept
-    {
-        return mFailurePercentage;
-    }
-
-    void setFailurePercentage(float f) noexcept
-    {
-        mFailurePercentage = f;
-    }
-
-    float getAbsoluteTolerance() const noexcept
-    {
-        return mAbsTolerance;
-    }
-
-    void setAbsoluteTolerance(float a) noexcept
-    {
-        mAbsTolerance = a;
-    }
-
-    float getTolerance() const noexcept
-    {
-        return mTolerance;
-    }
-
-    void setTolerance(float t) noexcept
-    {
-        mTolerance = t;
-    }
-
-    const char* getTimingCacheFilename() const noexcept
-    {
-        return mTimingCacheFilename.c_str();
-    }
-
-    void setTimingCacheFileName(const char* timingCacheFilename) noexcept
-    {
-        mTimingCacheFilename = std::string(timingCacheFilename);
-    }
-
-    bool isDebug() const noexcept
-    {
-#if ONNX_DEBUG
-        return (std::getenv("ONNX_DEBUG") ? true : false);
-#else
-        return false;
-#endif
-    }
-
-    void destroy() noexcept
-    {
-        delete this;
-    }
-
-}; // class SampleConfig
-
-#endif
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleDevice.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleDevice.h
deleted file mode 100644
index 2053ac7c..00000000
--- a/src/Detector/tensorrt_yolo/common_deprecated/sampleDevice.h
+++ /dev/null
@@ -1,494 +0,0 @@
-/*
- * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef TRT_SAMPLE_DEVICE_H
-#define TRT_SAMPLE_DEVICE_H
-
-#include
-#include
-#include
-#include
-#include
-
-namespace sample
-{
-
-inline void cudaCheck(cudaError_t ret, std::ostream& err = std::cerr)
-{
-    if (ret != cudaSuccess)
-    {
-        err << "Cuda failure: " << cudaGetErrorString(ret) << std::endl;
-        abort();
-    }
-}
-
-class TrtCudaEvent;
-
-namespace
-{
-
-void cudaSleep(void* sleep)
-{
-    std::this_thread::sleep_for(std::chrono::duration<float, std::milli>(*static_cast<float*>(sleep)));
-}
-
-} // namespace
-
-//!
-//! \class TrtCudaStream
-//! \brief Managed CUDA stream
-//!
-class TrtCudaStream
-{
-public:
-    TrtCudaStream()
-    {
-        cudaCheck(cudaStreamCreate(&mStream));
-    }
-
-    TrtCudaStream(const TrtCudaStream&) = delete;
-
-    TrtCudaStream& operator=(const TrtCudaStream&) = delete;
-
-    TrtCudaStream(TrtCudaStream&&) = delete;
-
-    TrtCudaStream& operator=(TrtCudaStream&&) = delete;
-
-    ~TrtCudaStream()
-    {
-        cudaCheck(cudaStreamDestroy(mStream));
-    }
-
-    cudaStream_t get() const
-    {
-        return mStream;
-    }
-
-    void synchronize()
-    {
-        cudaCheck(cudaStreamSynchronize(mStream));
-    }
-
-    void wait(TrtCudaEvent& event);
-
-    void sleep(float* ms)
-    {
-        cudaCheck(cudaLaunchHostFunc(mStream, cudaSleep, ms));
-    }
-
-private:
-    cudaStream_t mStream{};
-};
-
-//!
-//! \class TrtCudaEvent
-//! \brief Managed CUDA event
-//!
-class TrtCudaEvent
-{
-public:
-    explicit TrtCudaEvent(bool blocking = true)
-    {
-        const uint32_t flags = blocking ?
cudaEventBlockingSync : cudaEventDefault; - cudaCheck(cudaEventCreateWithFlags(&mEvent, flags)); - } - - TrtCudaEvent(const TrtCudaEvent&) = delete; - - TrtCudaEvent& operator=(const TrtCudaEvent&) = delete; - - TrtCudaEvent(TrtCudaEvent&&) = delete; - - TrtCudaEvent& operator=(TrtCudaEvent&&) = delete; - - ~TrtCudaEvent() - { - cudaCheck(cudaEventDestroy(mEvent)); - } - - cudaEvent_t get() const - { - return mEvent; - } - - void record(const TrtCudaStream& stream) - { - cudaCheck(cudaEventRecord(mEvent, stream.get())); - } - - void synchronize() - { - cudaCheck(cudaEventSynchronize(mEvent)); - } - - // Returns time elapsed time in milliseconds - float operator-(const TrtCudaEvent& e) const - { - float time{0}; - cudaCheck(cudaEventElapsedTime(&time, e.get(), get())); - return time; - } - -private: - cudaEvent_t mEvent{}; -}; - -inline void TrtCudaStream::wait(TrtCudaEvent& event) -{ - cudaCheck(cudaStreamWaitEvent(mStream, event.get(), 0)); -} - -//! -//! \class TrtCudaGraph -//! \brief Managed CUDA graph -//! -class TrtCudaGraph -{ -public: - explicit TrtCudaGraph() = default; - - TrtCudaGraph(const TrtCudaGraph&) = delete; - - TrtCudaGraph& operator=(const TrtCudaGraph&) = delete; - - TrtCudaGraph(TrtCudaGraph&&) = delete; - - TrtCudaGraph& operator=(TrtCudaGraph&&) = delete; - - ~TrtCudaGraph() - { - if (mGraphExec) - { - cudaGraphExecDestroy(mGraphExec); - } - } - - void beginCapture(TrtCudaStream& stream) - { - cudaCheck(cudaStreamBeginCapture(stream.get(), cudaStreamCaptureModeThreadLocal)); - } - - bool launch(TrtCudaStream& stream) - { - return cudaGraphLaunch(mGraphExec, stream.get()) == cudaSuccess; - } - - void endCapture(TrtCudaStream& stream) - { - cudaCheck(cudaStreamEndCapture(stream.get(), &mGraph)); - cudaCheck(cudaGraphInstantiate(&mGraphExec, mGraph, nullptr, nullptr, 0)); - cudaCheck(cudaGraphDestroy(mGraph)); - } - - void endCaptureOnError(TrtCudaStream& stream) - { - // There are two possibilities why stream capture would fail: - // (1) stream is in cudaErrorStreamCaptureInvalidated state. - // (2) TRT reports a failure. - // In case (1), the returning mGraph should be nullptr. - // In case (2), the returning mGraph is not nullptr, but it should not be used. - const auto ret = cudaStreamEndCapture(stream.get(), &mGraph); - if (ret == cudaErrorStreamCaptureInvalidated) - { - assert(mGraph == nullptr); - } - else - { - assert(ret == cudaSuccess); - assert(mGraph != nullptr); - cudaCheck(cudaGraphDestroy(mGraph)); - mGraph = nullptr; - } - // Clean up any CUDA error. - cudaGetLastError(); - sample::gLogWarning << "The CUDA graph capture on the stream has failed." << std::endl; - } - -private: - cudaGraph_t mGraph{}; - cudaGraphExec_t mGraphExec{}; -}; - -//! -//! \class TrtCudaBuffer -//! \brief Managed buffer for host and device -//! 
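The stream and event wrappers above make GPU timing a three-line affair via the operator- overload, which returns elapsed milliseconds. A sketch:

#include "sampleDevice.h"

float timedCopy(void* dst, const void* src, size_t bytes)
{
    sample::TrtCudaStream stream;
    sample::TrtCudaEvent start, stop;
    start.record(stream);
    sample::cudaCheck(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToDevice, stream.get()));
    stop.record(stream);
    stop.synchronize();
    return stop - start; // milliseconds between the two events
}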
-//!
-//! \class TrtCudaBuffer
-//! \brief Managed buffer for host and device
-//!
-template <typename A, typename D>
-class TrtCudaBuffer
-{
-public:
-    TrtCudaBuffer() = default;
-
-    TrtCudaBuffer(const TrtCudaBuffer&) = delete;
-
-    TrtCudaBuffer& operator=(const TrtCudaBuffer&) = delete;
-
-    TrtCudaBuffer(TrtCudaBuffer&& rhs)
-    {
-        reset(rhs.mPtr);
-        rhs.mPtr = nullptr;
-    }
-
-    TrtCudaBuffer& operator=(TrtCudaBuffer&& rhs)
-    {
-        if (this != &rhs)
-        {
-            reset(rhs.mPtr);
-            rhs.mPtr = nullptr;
-        }
-        return *this;
-    }
-
-    ~TrtCudaBuffer()
-    {
-        reset();
-    }
-
-    TrtCudaBuffer(size_t size)
-    {
-        A()(&mPtr, size);
-    }
-
-    void allocate(size_t size)
-    {
-        reset();
-        A()(&mPtr, size);
-    }
-
-    void reset(void* ptr = nullptr)
-    {
-        if (mPtr)
-        {
-            D()(mPtr);
-        }
-        mPtr = ptr;
-    }
-
-    void* get() const
-    {
-        return mPtr;
-    }
-
-private:
-    void* mPtr{nullptr};
-};
-
-struct DeviceAllocator
-{
-    void operator()(void** ptr, size_t size)
-    {
-        cudaCheck(cudaMalloc(ptr, size));
-    }
-};
-
-struct DeviceDeallocator
-{
-    void operator()(void* ptr)
-    {
-        cudaCheck(cudaFree(ptr));
-    }
-};
-
-struct ManagedAllocator
-{
-    void operator()(void** ptr, size_t size)
-    {
-        cudaCheck(cudaMallocManaged(ptr, size));
-    }
-};
-
-struct HostAllocator
-{
-    void operator()(void** ptr, size_t size)
-    {
-        cudaCheck(cudaMallocHost(ptr, size));
-    }
-};
-
-struct HostDeallocator
-{
-    void operator()(void* ptr)
-    {
-        cudaCheck(cudaFreeHost(ptr));
-    }
-};
-
-using TrtDeviceBuffer = TrtCudaBuffer<DeviceAllocator, DeviceDeallocator>;
-using TrtManagedBuffer = TrtCudaBuffer<ManagedAllocator, DeviceDeallocator>;
-
-using TrtHostBuffer = TrtCudaBuffer<HostAllocator, HostDeallocator>;
-
-//!
-//! \class IMirroredBuffer
-//! \brief Coupled host and device buffers
-//!
-class IMirroredBuffer
-{
-public:
-    //!
-    //! Allocate memory for the mirrored buffer given the size
-    //! of the allocation.
-    //!
-    virtual void allocate(size_t size) = 0;
-
-    //!
-    //! Get the pointer to the device side buffer.
-    //!
-    //! \return pointer to device memory or nullptr if uninitialized.
-    //!
-    virtual void* getDeviceBuffer() const = 0;
-
-    //!
-    //! Get the pointer to the host side buffer.
-    //!
-    //! \return pointer to host memory or nullptr if uninitialized.
-    //!
-    virtual void* getHostBuffer() const = 0;
-
-    //!
-    //! Copy the memory from host to device.
-    //!
-    virtual void hostToDevice(TrtCudaStream& stream) = 0;
-
-    //!
-    //! Copy the memory from device to host.
-    //!
-    virtual void deviceToHost(TrtCudaStream& stream) = 0;
-
-    //!
-    //! Interface to get the size of the memory
-    //!
-    //! \return the size of memory allocated.
-    //!
-    virtual size_t getSize() const = 0;
-
-    //!
-    //! Virtual destructor declaration
-    //!
-    virtual ~IMirroredBuffer() = default;
-
-}; // class IMirroredBuffer
-
-//!
-//! Class to have a separate memory buffer for discrete device and host allocations.
-//!
-class DiscreteMirroredBuffer : public IMirroredBuffer
-{
-public:
-    void allocate(size_t size)
-    {
-        mSize = size;
-        mHostBuffer.allocate(size);
-        mDeviceBuffer.allocate(size);
-    }
-
-    void* getDeviceBuffer() const
-    {
-        return mDeviceBuffer.get();
-    }
-
-    void* getHostBuffer() const
-    {
-        return mHostBuffer.get();
-    }
-
-    void hostToDevice(TrtCudaStream& stream)
-    {
-        cudaCheck(cudaMemcpyAsync(mDeviceBuffer.get(), mHostBuffer.get(), mSize, cudaMemcpyHostToDevice, stream.get()));
-    }
-
-    void deviceToHost(TrtCudaStream& stream)
-    {
-        cudaCheck(cudaMemcpyAsync(mHostBuffer.get(), mDeviceBuffer.get(), mSize, cudaMemcpyDeviceToHost, stream.get()));
-    }
-
-    size_t getSize() const
-    {
-        return mSize;
-    }
-
-private:
-    size_t mSize{0};
-    TrtHostBuffer mHostBuffer;
-    TrtDeviceBuffer mDeviceBuffer;
-}; // class DiscreteMirroredBuffer
-
-//!
-//! Class to have a unified memory buffer for embedded devices.
-//!
-class UnifiedMirroredBuffer : public IMirroredBuffer
-{
-public:
-    void allocate(size_t size)
-    {
-        mSize = size;
-        mBuffer.allocate(size);
-    }
-
-    void* getDeviceBuffer() const
-    {
-        return mBuffer.get();
-    }
-
-    void* getHostBuffer() const
-    {
-        return mBuffer.get();
-    }
-
-    void hostToDevice(TrtCudaStream& /*stream*/)
-    {
-        // Does nothing since we are using unified memory.
-    }
-
-    void deviceToHost(TrtCudaStream& /*stream*/)
-    {
-        // Does nothing since we are using unified memory.
-    }
-
-    size_t getSize() const
-    {
-        return mSize;
-    }
-
-private:
-    size_t mSize{0};
-    TrtManagedBuffer mBuffer;
-}; // class UnifiedMirroredBuffer
-
-inline void setCudaDevice(int device, std::ostream& os)
-{
-    cudaCheck(cudaSetDevice(device));
-
-    cudaDeviceProp properties;
-    cudaCheck(cudaGetDeviceProperties(&properties, device));
-
-// clang-format off
-    os << "=== Device Information ===" << std::endl;
-    os << "Selected Device: " << properties.name << std::endl;
-    os << "Compute Capability: " << properties.major << "." << properties.minor << std::endl;
-    os << "SMs: " << properties.multiProcessorCount << std::endl;
-    os << "Compute Clock Rate: " << properties.clockRate / 1000000.0F << " GHz" << std::endl;
-    os << "Device Global Memory: " << (properties.totalGlobalMem >> 20) << " MiB" << std::endl;
-    os << "Shared Memory per SM: " << (properties.sharedMemPerMultiprocessor >> 10) << " KiB" << std::endl;
-    os << "Memory Bus Width: " << properties.memoryBusWidth << " bits"
-       << " (ECC " << (properties.ECCEnabled != 0 ? "enabled" : "disabled") << ")" << std::endl;
-    os << "Memory Clock Rate: " << properties.memoryClockRate / 1000000.0F << " GHz" << std::endl;
-    // clang-format on
-}
-
-} // namespace sample
-
-#endif // TRT_SAMPLE_DEVICE_H
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.cpp b/src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.cpp
deleted file mode 100644
index 8bb8a8fe..00000000
--- a/src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.cpp
+++ /dev/null
@@ -1,1629 +0,0 @@
-/*
- * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <algorithm>
-#include <cassert>
-#include <cstdlib>
-#include <fstream>
-#include <iostream>
-#include <iterator>
-#include <map>
-#include <random>
-#include <sstream>
-#include <string>
-#include <vector>
-
-#include "NvInfer.h"
-#include "NvOnnxParser.h"
-
-#include "common.h"
-#include "ErrorRecorder.h"
-#include "half.h"
-#include "logger.h"
-#include "sampleEngines.h"
-#include "sampleOptions.h"
-#include "sampleUtils.h"
-
-#if !defined(_WIN32)
-#include <dlfcn.h>
-#endif
-
-namespace sample
-{
-
-namespace
-{
-
-std::map<std::string, float> readScalesFromCalibrationCache(const std::string& calibrationFile)
-{
-    std::map<std::string, float> tensorScales;
-    std::ifstream cache{calibrationFile};
-    if (!cache.is_open())
-    {
-        sample::gLogError << "[TRT] Can not open provided calibration cache file" << std::endl;
-        return tensorScales;
-    }
-    std::string line;
-    while (std::getline(cache, line))
-    {
-        auto colonPos = line.find_last_of(':');
-        if (colonPos != std::string::npos)
-        {
-            // Scales should be stored in calibration cache as 32-bit floating numbers encoded as 32-bit integers
-            int32_t scalesAsInt = std::stoi(line.substr(colonPos + 2, 8), nullptr, 16);
-            const auto tensorName = line.substr(0, colonPos);
-            tensorScales[tensorName] = *reinterpret_cast<float*>(&scalesAsInt);
-        }
-    }
-    cache.close();
-    return tensorScales;
-}
-} // namespace
-
-void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, const std::vector<IOFormat>& inputFormats,
-    const std::vector<IOFormat>& outputFormats, const std::string& calibrationFile)
-{
-    const auto tensorScales = readScalesFromCalibrationCache(calibrationFile);
-    const bool broadcastInputFormats = broadcastIOFormats(inputFormats, network.getNbInputs());
-    for (int32_t i = 0, n = network.getNbInputs(); i < n; ++i)
-    {
-        int32_t formatIdx = broadcastInputFormats ? 0 : i;
-        if (!inputFormats.empty() && inputFormats[formatIdx].first == nvinfer1::DataType::kINT8)
-        {
-            auto* input = network.getInput(i);
-            const auto calibScale = tensorScales.at(input->getName());
-            input->setDynamicRange(-127 * calibScale, 127 * calibScale);
-        }
-    }
-    const bool broadcastOutputFormats = broadcastIOFormats(outputFormats, network.getNbInputs());
-    for (int32_t i = 0, n = network.getNbOutputs(); i < n; ++i)
-    {
-        int32_t formatIdx = broadcastOutputFormats ?
0 : i; - if (!outputFormats.empty() && outputFormats[formatIdx].first == nvinfer1::DataType::kINT8) - { - auto* output = network.getOutput(i); - const auto calibScale = tensorScales.at(output->getName()); - output->setDynamicRange(-127 * calibScale, 127 * calibScale); - } - } -} - -#define SMP_RETVAL_IF_FALSE(condition, msg, retval, err) \ - { \ - if ((condition) == false) \ - { \ - (err) << (msg) << std::endl; \ - return retval; \ - } \ - } - -Parser modelToNetwork(const ModelOptions& model, nvinfer1::INetworkDefinition& network, std::ostream& err) -{ - sample::gLogInfo << "Start parsing network model" << std::endl; - Parser parser; - //const std::string& modelName = model.baseModel.model; - switch (model.baseModel.format) - { - case ModelFormat::kONNX: - { - using namespace nvonnxparser; - parser.onnxParser.reset(createParser(network, sample::gLogger.getTRTLogger())); - if (!parser.onnxParser->parseFromFile( - model.baseModel.model.c_str(), static_cast(sample::gLogger.getReportableSeverity()))) - { - err << "Failed to parse onnx file" << std::endl; - parser.onnxParser.reset(); - } - break; - } - case ModelFormat::kANY: - break; - default: - break; - } - - sample::gLogInfo << "Finish parsing network model" << std::endl; - return parser; -} - -namespace -{ - -class RndInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 -{ -public: - RndInt8Calibrator(int batches, std::vector& elemCount, const std::string& cacheFile, - const nvinfer1::INetworkDefinition& network, std::ostream& err); - - ~RndInt8Calibrator() - { - for (auto& elem : mInputDeviceBuffers) - { - cudaCheck(cudaFree(elem.second), mErr); - } - } - - bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept override; - - int getBatchSize() const noexcept override - { - return 1; - } - - const void* readCalibrationCache(size_t& length) noexcept override; - - virtual void writeCalibrationCache(const void*, size_t) noexcept override {} - -private: - int mBatches{}; - int mCurrentBatch{}; - std::string mCacheFile; - std::map mInputDeviceBuffers; - std::vector mCalibrationCache; - std::ostream& mErr; -}; - -RndInt8Calibrator::RndInt8Calibrator(int batches, std::vector& elemCount, const std::string& cacheFile, - const nvinfer1::INetworkDefinition& network, std::ostream& err) - : mBatches(batches) - , mCurrentBatch(0) - , mCacheFile(cacheFile) - , mErr(err) -{ - std::ifstream tryCache(cacheFile, std::ios::binary); - if (tryCache.good()) - { - return; - } - - std::default_random_engine generator; - std::uniform_real_distribution distribution(-1.0F, 1.0F); - auto gen = [&generator, &distribution]() { return distribution(generator); }; - - for (int i = 0; i < network.getNbInputs(); i++) - { - auto* input = network.getInput(i); - std::vector rnd_data(elemCount[i]); - std::generate_n(rnd_data.begin(), elemCount[i], gen); - - void* data; - cudaCheck(cudaMalloc(&data, elemCount[i] * sizeof(float)), mErr); - cudaCheck(cudaMemcpy(data, rnd_data.data(), elemCount[i] * sizeof(float), cudaMemcpyHostToDevice), mErr); - - mInputDeviceBuffers.insert(std::make_pair(input->getName(), data)); - } -} - -bool RndInt8Calibrator::getBatch(void* bindings[], const char* names[], int nbBindings) noexcept -{ - if (mCurrentBatch >= mBatches) - { - return false; - } - - for (int i = 0; i < nbBindings; ++i) - { - bindings[i] = mInputDeviceBuffers[names[i]]; - } - - ++mCurrentBatch; - - return true; -} - -const void* RndInt8Calibrator::readCalibrationCache(size_t& length) noexcept -{ - mCalibrationCache.clear(); - std::ifstream 
input(mCacheFile, std::ios::binary); - input >> std::noskipws; - if (input.good()) - { - std::copy( - std::istream_iterator(input), std::istream_iterator(), std::back_inserter(mCalibrationCache)); - } - - length = mCalibrationCache.size(); - return !mCalibrationCache.empty() ? mCalibrationCache.data() : nullptr; -} - -bool setTensorDynamicRange(const nvinfer1::INetworkDefinition& network, float inRange = 2.0F, float outRange = 4.0F) -{ - // Ensure that all layer inputs have a dynamic range. - for (int l = 0; l < network.getNbLayers(); l++) - { - auto* layer = network.getLayer(l); - for (int i = 0; i < layer->getNbInputs(); i++) - { - nvinfer1::ITensor* input{layer->getInput(i)}; - // Optional inputs are nullptr here and are from RNN layers. - if (input && !input->dynamicRangeIsSet()) - { - // Concat should propagate dynamic range from outputs to inputs to avoid - // Re-quantization during the concatenation - auto dynRange = (layer->getType() == nvinfer1::LayerType::kCONCATENATION) ? outRange : inRange; - if (!input->setDynamicRange(-dynRange, dynRange)) - { - return false; - } - } - } - for (int o = 0; o < layer->getNbOutputs(); o++) - { - nvinfer1::ITensor* output{layer->getOutput(o)}; - // Optional outputs are nullptr here and are from RNN layers. - if (output && !output->dynamicRangeIsSet()) - { - // Pooling must have the same input and output dynamic range. - if (layer->getType() == nvinfer1::LayerType::kPOOLING) - { - if (!output->setDynamicRange(-inRange, inRange)) - { - return false; - } - } - else - { - if (!output->setDynamicRange(-outRange, outRange)) - { - return false; - } - } - } - } - } - return true; -} - -// Walk the weights elements and overwrite (at most) 2 out of 4 elements to 0. -template -void sparsify(const T* values, int64_t count, int32_t k, int32_t rs, std::vector& sparseWeights) -{ - const auto c = count / (k * rs); - sparseWeights.resize(count * sizeof(T)); - auto* sparseValues = reinterpret_cast(sparseWeights.data()); - - constexpr int32_t window = 4; - constexpr int32_t nonzeros = 2; - - const int32_t crs = c * rs; - const auto getIndex = [=](int32_t ki, int32_t ci, int32_t rsi) { return ki * crs + ci * rs + rsi; }; - - for (int64_t ki = 0; ki < k; ++ki) - { - for (int64_t rsi = 0; rsi < rs; ++rsi) - { - int32_t w = 0; - int32_t nz = 0; - for (int64_t ci = 0; ci < c; ++ci) - { - const auto index = getIndex(ki, ci, rsi); - if (nz < nonzeros) - { - sparseValues[index] = values[index]; - ++nz; - } - else - { - sparseValues[index] = 0; - } - if (++w == window) - { - w = 0; - nz = 0; - } - } - } - } -} - -void sparsify(const nvinfer1::Weights& weights, int32_t k, int32_t rs, std::vector& sparseWeights) -{ - switch (weights.type) - { - case nvinfer1::DataType::kFLOAT: - sparsify(static_cast(weights.values), weights.count, k, rs, sparseWeights); - break; - case nvinfer1::DataType::kHALF: - sparsify(static_cast(weights.values), weights.count, k, rs, sparseWeights); - break; - case nvinfer1::DataType::kINT8: - case nvinfer1::DataType::kINT32: - case nvinfer1::DataType::kBOOL: break; - } -} - -template -void setSparseWeights(L& l, int32_t k, int32_t rs, std::vector& sparseWeights) -{ - auto weights = l.getKernelWeights(); - sparsify(weights, k, rs, sparseWeights); - weights.values = sparseWeights.data(); - l.setKernelWeights(weights); -} - -template -void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n) -{ - ASSERT(dst != src); - T* tdst = reinterpret_cast(dst); - T const* tsrc = reinterpret_cast(src); - for (int32_t mi = 0; mi < m; ++mi) 
- { - for (int32_t ni = 0; ni < n; ++ni) - { - int32_t const isrc = mi * n + ni; - int32_t const idst = ni * m + mi; - tdst[idst] = tsrc[isrc]; - } - } -} - -// Sparsify the weights of Constant layers that are fed to MatMul via Shuffle layers. -// Forward analysis on the API graph to determine which weights to sparsify. -void sparsifyMatMulKernelWeights(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights) -{ - using TensorToLayer = std::unordered_map; - using LayerToTensor = std::unordered_map; - - // 1. Collect layers and tensors information from the network. - TensorToLayer matmulI2L; - TensorToLayer constO2L; - TensorToLayer shuffleI2L; - LayerToTensor shuffleL2O; - auto collectMappingInfo = [&](int32_t const idx) { - nvinfer1::ILayer* l = network.getLayer(idx); - switch (l->getType()) - { - case nvinfer1::LayerType::kMATRIX_MULTIPLY: - { - // assume weights on the second input. - matmulI2L.insert({l->getInput(1), l}); - break; - } - case nvinfer1::LayerType::kCONSTANT: - { - nvinfer1::DataType const dtype = static_cast(l)->getWeights().type; - if (dtype == nvinfer1::DataType::kFLOAT || dtype == nvinfer1::DataType::kHALF) - { - // Sparsify float only. - constO2L.insert({l->getOutput(0), l}); - } - break; - } - case nvinfer1::LayerType::kSHUFFLE: - { - shuffleI2L.insert({l->getInput(0), l}); - shuffleL2O.insert({l, l->getOutput(0)}); - break; - } - default: break; - } - }; - int32_t const nbLayers = network.getNbLayers(); - for (int32_t i = 0; i < nbLayers; ++i) - { - collectMappingInfo(i); - } - if (matmulI2L.size() == 0 || constO2L.size() == 0) - { - // No MatrixMultiply or Constant layer found, no weights to sparsify. - return; - } - - // Helper for analysis - auto isTranspose = [](nvinfer1::Permutation const& perm) -> bool { return (perm.order[0] == 1 && perm.order[1] == 0); }; - auto is2D = [](nvinfer1::Dims const& dims) -> bool { return dims.nbDims == 2; }; - auto isIdenticalReshape = [](nvinfer1::Dims const& dims) -> bool { - for (int32_t i = 0; i < dims.nbDims; ++i) - { - if (dims.d[i] != i || dims.d[i] != -1) - { - return false; - } - } - return true; - }; - auto tensorReachedViaTranspose = [&](nvinfer1::ITensor* t, bool& needTranspose) - { - while (shuffleI2L.find(t) != shuffleI2L.end()) - { - nvinfer1::IShuffleLayer* s = static_cast(shuffleI2L.at(t)); - if (!is2D(s->getInput(0)->getDimensions()) || !is2D(s->getReshapeDimensions()) - || !isIdenticalReshape(s->getReshapeDimensions())) - { - break; - } - - if (isTranspose(s->getFirstTranspose())) - needTranspose = !needTranspose; - if (isTranspose(s->getSecondTranspose())) - needTranspose = !needTranspose; - - t = shuffleL2O.at(s); - } - return t; - }; - - // 2. Forward analysis to collect the Constant layers connected to MatMul via Transpose - std::unordered_map constantLayerToSparse; - for (auto& o2l : constO2L) - { - // If need to transpose the weights of the Constant layer. - // Need to transpose by default due to semantic difference. - bool needTranspose{true}; - nvinfer1::ITensor* t = tensorReachedViaTranspose(o2l.first, needTranspose); - if (matmulI2L.find(t) == matmulI2L.end()) - { - continue; - } - - // check MatMul params... 
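        // Editorial note (added commentary, not original code): sparsify() above
        // enforces 2:4 structured sparsity: within each window of 4 weights along
        // the channel axis, at most 2 remain non-zero, e.g.
        //     in : a b c d | e f g h
        //     out: a b 0 0 | e f 0 0
        // which is the pattern sparse tensor cores accelerate. The checks just
        // below confirm the MatMul is a plain 2-D case before applying it.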
- nvinfer1::IMatrixMultiplyLayer* mm = static_cast(matmulI2L.at(t)); - bool const twoInputs = mm->getNbInputs() == 2; - bool const all2D = is2D(mm->getInput(0)->getDimensions()) && is2D(mm->getInput(1)->getDimensions()); - bool const isSimple - = mm->getOperation(0) == nvinfer1::MatrixOperation::kNONE && mm->getOperation(1) != nvinfer1::MatrixOperation::kVECTOR; - if (!(twoInputs && all2D && isSimple)) - continue; - - if (mm->getOperation(1) == nvinfer1::MatrixOperation::kTRANSPOSE) - needTranspose = !needTranspose; - - constantLayerToSparse.insert({static_cast(o2l.second), needTranspose}); - } - - // 3. Finally, sparsify the weights - auto sparsifyConstantWeights = [&sparseWeights](nvinfer1::IConstantLayer* layer, bool const needTranspose) - { - nvinfer1::Dims dims = layer->getOutput(0)->getDimensions(); - ASSERT(dims.nbDims == 2); - int32_t const idxN = needTranspose ? 1 : 0; - int32_t const n = dims.d[idxN]; - int32_t const k = dims.d[1 - idxN]; - sparseWeights.emplace_back(); - std::vector& spw = sparseWeights.back(); - nvinfer1::Weights w = layer->getWeights(); - nvinfer1::DataType const dtype = w.type; - ASSERT(dtype == nvinfer1::DataType::kFLOAT || dtype == nvinfer1::DataType::kHALF); // non-float weights should have been ignored. - - if (needTranspose) - { - if (dtype == nvinfer1::DataType::kFLOAT) - { - spw.resize(w.count * sizeof(float)); - transpose2DWeights(spw.data(), w.values, k, n); - } - else if (dtype == nvinfer1::DataType::kHALF) - { - spw.resize(w.count * sizeof(half_float::half)); - transpose2DWeights(spw.data(), w.values, k, n); - } - - w.values = spw.data(); - std::vector tmpW; - sparsify(w, n, 1, tmpW); - - if (dtype == nvinfer1::DataType::kFLOAT) - transpose2DWeights(spw.data(), tmpW.data(), n, k); - else if (dtype == nvinfer1::DataType::kHALF) - transpose2DWeights(spw.data(), tmpW.data(), n, k); - } - else - { - sparsify(w, n, 1, spw); - } - - w.values = spw.data(); - layer->setWeights(w); - }; - for (auto& l : constantLayerToSparse) - { - sparsifyConstantWeights(l.first, l.second); - } -} - -void sparsify(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights) -{ - for (int32_t l = 0; l < network.getNbLayers(); ++l) - { - auto* layer = network.getLayer(l); - const auto t = layer->getType(); - if (t == nvinfer1::LayerType::kCONVOLUTION) - { - auto& conv = *static_cast(layer); - const auto& dims = conv.getKernelSizeNd(); - if (dims.nbDims > 2) - { - continue; - } - const auto k = conv.getNbOutputMaps(); - const auto rs = dims.d[0] * dims.d[1]; - sparseWeights.emplace_back(); - setSparseWeights(conv, k, rs, sparseWeights.back()); - } - else if (t == nvinfer1::LayerType::kFULLY_CONNECTED) - { - auto& fc = *static_cast(layer); - const auto k = fc.getNbOutputChannels(); - sparseWeights.emplace_back(); - setSparseWeights(fc, k, 1, sparseWeights.back()); - } - } - - sparsifyMatMulKernelWeights(network, sparseWeights); -} - -void setLayerPrecisions(nvinfer1::INetworkDefinition& network, LayerPrecisions const& layerPrecisions) -{ - bool const hasGlobalPrecision{layerPrecisions.find("*") != layerPrecisions.end()}; - auto const globalPrecision = hasGlobalPrecision ? 
layerPrecisions.at("*") : nvinfer1::DataType::kFLOAT; - bool hasLayerPrecisionSkipped{false}; - for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) - { - auto* layer = network.getLayer(layerIdx); - auto const layerName = layer->getName(); - if (layerPrecisions.find(layer->getName()) != layerPrecisions.end()) - { - layer->setPrecision(layerPrecisions.at(layer->getName())); - } - else if (hasGlobalPrecision) - { - // We should not set the layer precision if its default precision is INT32 or Bool. - if (layer->getPrecision() == nvinfer1::DataType::kINT32 - || layer->getPrecision() == nvinfer1::DataType::kBOOL) - { - hasLayerPrecisionSkipped = true; - sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because the " - << " default layer precision is INT32 or Bool." << std::endl; - continue; - } - // We should not set the constant layer precision if its weights are in INT32. - if (layer->getType() == nvinfer1::LayerType::kCONSTANT - && static_cast(layer)->getWeights().type == nvinfer1::DataType::kINT32) - { - hasLayerPrecisionSkipped = true; - sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because this " - << "constant layer has INT32 weights." << std::endl; - continue; - } - // We should not set the layer precision if the layer operates on a shape tensor. - if (layer->getNbInputs() >= 1 && layer->getInput(0)->isShapeTensor()) - { - hasLayerPrecisionSkipped = true; - sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because this layer " - << "operates on a shape tensor." << std::endl; - continue; - } - if ((layer->getType() == nvinfer1::LayerType::kIDENTITY - || layer->getType() == nvinfer1::LayerType::kSHUFFLE) - && layer->getNbInputs() >= 1 && layer->getInput(0)->getType() == nvinfer1::DataType::kINT32 - && layer->getNbOutputs() >= 1 && layer->getOutput(0)->getType() == nvinfer1::DataType::kINT32) - { - hasLayerPrecisionSkipped = true; - sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because this " - << "layer has INT32 input and output." << std::endl; - continue; - } - // All heuristics passed. Set the layer precision. - layer->setPrecision(globalPrecision); - } - } - - if (hasLayerPrecisionSkipped) - { - sample::gLogInfo << "Skipped setting precisions for some layers. Check verbose logs for more details." - << std::endl; - } -} - -void setLayerOutputTypes(nvinfer1::INetworkDefinition& network, LayerOutputTypes const& layerOutputTypes) -{ - bool const hasGlobalOutputType{layerOutputTypes.find("*") != layerOutputTypes.end()}; - auto const globalOutputType = hasGlobalOutputType ? layerOutputTypes.at("*").at(0) : nvinfer1::DataType::kFLOAT; - bool hasLayerOutputTypeSkipped{false}; - for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) - { - auto* layer = network.getLayer(layerIdx); - auto const layerName = layer->getName(); - auto const nbOutputs = layer->getNbOutputs(); - if (layerOutputTypes.find(layer->getName()) != layerOutputTypes.end()) - { - auto const& outputTypes = layerOutputTypes.at(layer->getName()); - bool const isBroadcast = (outputTypes.size() == 1); - if (!isBroadcast && static_cast(outputTypes.size()) != nbOutputs) - { - sample::gLogError << "Layer " << layerName << " has " << nbOutputs << " outputs but " - << outputTypes.size() << " output types are given in --layerOutputTypes flag." 
- << std::endl; - throw std::invalid_argument("Invalid --layerOutputTypes flag."); - } - for (int32_t outputIdx = 0; outputIdx < nbOutputs; ++outputIdx) - { - layer->setOutputType(outputIdx, outputTypes.at(isBroadcast ? 0 : outputIdx)); - } - } - else if (hasGlobalOutputType) - { - // We should not set the layer output types if its default precision is INT32 or Bool. - if (layer->getPrecision() == nvinfer1::DataType::kINT32 - || layer->getPrecision() == nvinfer1::DataType::kBOOL) - { - hasLayerOutputTypeSkipped = true; - sample::gLogVerbose << "Skipped setting output types for layer " << layerName << " because the " - << " default layer precision is INT32 or Bool." << std::endl; - continue; - } - // We should not set the constant layer output types if its weights are in INT32. - if (layer->getType() == nvinfer1::LayerType::kCONSTANT - && static_cast(layer)->getWeights().type == nvinfer1::DataType::kINT32) - { - hasLayerOutputTypeSkipped = true; - sample::gLogVerbose << "Skipped setting output types for layer " << layerName << " because this " - << "constant layer has INT32 weights." << std::endl; - continue; - } - for (int32_t outputIdx = 0; outputIdx < nbOutputs; ++outputIdx) - { - // We should not set the output type if the output is a shape tensor. - if (layer->getOutput(0)->isShapeTensor()) - { - hasLayerOutputTypeSkipped = true; - sample::gLogVerbose << "Skipped setting output type for output " << outputIdx << " of layer " - << layerName << " because it is a shape tensor." << std::endl; - continue; - } - layer->setOutputType(outputIdx, globalOutputType); - } - } - } - - if (hasLayerOutputTypeSkipped) - { - sample::gLogInfo << "Skipped setting output types for some layers. Check verbose logs for more details." - << std::endl; - } -} - -void setMemoryPoolLimits(nvinfer1::IBuilderConfig& config, BuildOptions const& build) -{ - auto const roundToBytes = [](double const sizeInMB) { return static_cast(sizeInMB * (1 << 20)); }; - if (build.workspace >= 0) - config.setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, roundToBytes(build.workspace)); - if (build.dlaSRAM >= 0) - config.setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_MANAGED_SRAM, roundToBytes(build.dlaSRAM)); - if (build.dlaLocalDRAM >= 0) - config.setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_LOCAL_DRAM, roundToBytes(build.dlaLocalDRAM)); - if (build.dlaGlobalDRAM >= 0) - config.setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_GLOBAL_DRAM, roundToBytes(build.dlaGlobalDRAM)); -} - -} // namespace - -bool setupNetworkAndConfig(const BuildOptions& build, const SystemOptions& sys, nvinfer1::IBuilder& builder, - nvinfer1::INetworkDefinition& network, nvinfer1::IBuilderConfig& config, std::ostream& err, - std::vector>& sparseWeights) -{ - nvinfer1::IOptimizationProfile* profile{nullptr}; - if (build.maxBatch) - builder.setMaxBatchSize(build.maxBatch); - else - profile = builder.createOptimizationProfile(); - - bool hasDynamicShapes{false}; - - bool broadcastInputFormats = broadcastIOFormats(build.inputFormats, network.getNbInputs()); - - if (profile) - { - // Check if the provided input tensor names match the input tensors of the engine. - // Throw an error if the provided input tensor names cannot be found because it implies a potential typo. 
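        // Editorial note (added commentary, not original code): build.shapes is the
        // parsed form of trtexec-style --minShapes/--optShapes/--maxShapes options;
        // a hypothetical entry "images:1x3x640x640" maps the tensor name "images"
        // to one dims vector per OptProfileSelector (kMIN/kOPT/kMAX). The loop
        // below only verifies that every such name exists among the network inputs.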
- for (const auto& shape : build.shapes) - { - bool tensorNameFound{false}; - for (int32_t i = 0; i < network.getNbInputs(); ++i) - { - if (network.getInput(i)->getName() == shape.first) - { - tensorNameFound = true; - break; - } - } - if (!tensorNameFound) - { - sample::gLogError << "Cannot find input tensor with name \"" << shape.first << "\" in the network " - << "inputs! Please make sure the input tensor names are correct." << std::endl; - return false; - } - } - } - - for (uint32_t i = 0, n = network.getNbInputs(); i < n; i++) - { - // Set formats and data types of inputs - auto* input = network.getInput(i); - if (!build.inputFormats.empty()) - { - int inputFormatIndex = broadcastInputFormats ? 0 : i; - input->setType(build.inputFormats[inputFormatIndex].first); - input->setAllowedFormats(build.inputFormats[inputFormatIndex].second); - } - else - { - switch (input->getType()) - { - case nvinfer1::DataType::kINT32: - case nvinfer1::DataType::kBOOL: - case nvinfer1::DataType::kHALF: - // Leave these as is. - break; - case nvinfer1::DataType::kFLOAT: - case nvinfer1::DataType::kINT8: - // User did not specify a floating-point format. Default to kFLOAT. - input->setType(nvinfer1::DataType::kFLOAT); - break; - } - input->setAllowedFormats(1U << static_cast(nvinfer1::TensorFormat::kLINEAR)); - } - - if (profile) - { - auto const dims = input->getDimensions(); - auto const isScalar = dims.nbDims == 0; - auto const isDynamicInput = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }) - || input->isShapeTensor(); - if (isDynamicInput) - { - hasDynamicShapes = true; - auto shape = build.shapes.find(input->getName()); - ShapeRange shapes{}; - - // If no shape is provided, set dynamic dimensions to 1. - if (shape == build.shapes.end()) - { - constexpr int DEFAULT_DIMENSION = 1; - std::vector staticDims; - if (input->isShapeTensor()) - { - if (isScalar) - { - staticDims.push_back(1); - } - else - { - staticDims.resize(dims.d[0]); - std::fill(staticDims.begin(), staticDims.end(), DEFAULT_DIMENSION); - } - } - else - { - staticDims.resize(dims.nbDims); - std::transform(dims.d, dims.d + dims.nbDims, staticDims.begin(), - [&](int dimension) { return dimension > 0 ? dimension : DEFAULT_DIMENSION; }); - } - sample::gLogWarning << "Dynamic dimensions required for input: " << input->getName() - << ", but no shapes were provided. 
Automatically overriding shape to: " - << staticDims << std::endl; - std::fill(shapes.begin(), shapes.end(), staticDims); - } - else - { - shapes = shape->second; - } - - std::vector profileDims{}; - if (input->isShapeTensor()) - { - profileDims = shapes[static_cast(nvinfer1::OptProfileSelector::kMIN)]; - SMP_RETVAL_IF_FALSE(profile->setShapeValues(input->getName(), nvinfer1::OptProfileSelector::kMIN, - profileDims.data(), static_cast(profileDims.size())), - "Error in set shape values MIN", false, err); - profileDims = shapes[static_cast(nvinfer1::OptProfileSelector::kOPT)]; - SMP_RETVAL_IF_FALSE(profile->setShapeValues(input->getName(), nvinfer1::OptProfileSelector::kOPT, - profileDims.data(), static_cast(profileDims.size())), - "Error in set shape values OPT", false, err); - profileDims = shapes[static_cast(nvinfer1::OptProfileSelector::kMAX)]; - SMP_RETVAL_IF_FALSE(profile->setShapeValues(input->getName(), nvinfer1::OptProfileSelector::kMAX, - profileDims.data(), static_cast(profileDims.size())), - "Error in set shape values MAX", false, err); - } - else - { - profileDims = shapes[static_cast(nvinfer1::OptProfileSelector::kMIN)]; - SMP_RETVAL_IF_FALSE( - profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, toDims(profileDims)), - "Error in set dimensions to profile MIN", false, err); - profileDims = shapes[static_cast(nvinfer1::OptProfileSelector::kOPT)]; - SMP_RETVAL_IF_FALSE( - profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, toDims(profileDims)), - "Error in set dimensions to profile OPT", false, err); - profileDims = shapes[static_cast(nvinfer1::OptProfileSelector::kMAX)]; - SMP_RETVAL_IF_FALSE( - profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, toDims(profileDims)), - "Error in set dimensions to profile MAX", false, err); - } - } - } - } - - if (!hasDynamicShapes && !build.shapes.empty()) - { - sample::gLogError << "Static model does not take explicit shapes since the shape of inference tensors will be " - "determined by the model itself" - << std::endl; - return false; - } - - if (profile && hasDynamicShapes) - { - SMP_RETVAL_IF_FALSE(profile->isValid(), "Required optimization profile is invalid", false, err); - SMP_RETVAL_IF_FALSE(config.addOptimizationProfile(profile) != -1, "Error in add optimization profile", false, err); - } - - bool broadcastOutputFormats = broadcastIOFormats(build.outputFormats, network.getNbOutputs(), false); - - for (uint32_t i = 0, n = network.getNbOutputs(); i < n; i++) - { - // Set formats and data types of outputs - auto* output = network.getOutput(i); - if (!build.outputFormats.empty()) - { - int outputFormatIndex = broadcastOutputFormats ? 
0 : i; - output->setType(build.outputFormats[outputFormatIndex].first); - output->setAllowedFormats(build.outputFormats[outputFormatIndex].second); - } - else - { - output->setAllowedFormats(1U << static_cast(nvinfer1::TensorFormat::kLINEAR)); - } - } - - setMemoryPoolLimits(config, build); - - if (build.timingCacheMode == TimingCacheMode::kDISABLE) - config.setFlag(nvinfer1::BuilderFlag::kDISABLE_TIMING_CACHE); - - if (!build.tf32) - config.clearFlag(nvinfer1::BuilderFlag::kTF32); - - if (build.refittable) - config.setFlag(nvinfer1::BuilderFlag::kREFIT); - - if (build.sparsity != SparsityFlag::kDISABLE) - { - config.setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS); - if (build.sparsity == SparsityFlag::kFORCE) - sparsify(network, sparseWeights); - } - - config.setProfilingVerbosity(build.profilingVerbosity); - config.setMinTimingIterations(build.minTiming); - config.setAvgTimingIterations(build.avgTiming); - - if (build.fp16) - config.setFlag(nvinfer1::BuilderFlag::kFP16); - - if (build.int8) - config.setFlag(nvinfer1::BuilderFlag::kINT8); - - if (build.int8 && !build.fp16) - { - sample::gLogInfo - << "FP32 and INT8 precisions have been specified - more performance might be enabled by additionally " - "specifying --fp16 or --best" - << std::endl; - } - - auto isInt8 = [](const IOFormat& format) { return format.first == nvinfer1::DataType::kINT8; }; - auto int8IO = std::count_if(build.inputFormats.begin(), build.inputFormats.end(), isInt8) - + std::count_if(build.outputFormats.begin(), build.outputFormats.end(), isInt8); - - auto hasQDQLayers = [](nvinfer1::INetworkDefinition& network) { - // Determine if our network has QDQ layers. - const auto nbLayers = network.getNbLayers(); - for (int32_t i = 0; i < nbLayers; i++) - { - const auto& layer = network.getLayer(i); - if (layer->getType() == nvinfer1::LayerType::kQUANTIZE || layer->getType() == nvinfer1::LayerType::kDEQUANTIZE) - return true; - } - return false; - }; - - if (!hasQDQLayers(network) && (build.int8 || int8IO) && build.calibration.empty()) - { - // Explicitly set int8 scales if no calibrator is provided and if I/O tensors use int8, - // because auto calibration does not support this case. - SMP_RETVAL_IF_FALSE(setTensorDynamicRange(network), "Error in set tensor dynamic range.", false, err); - } - else if (build.int8) - { - if (!hasQDQLayers(network) && int8IO) - { - try - { - // Set dynamic ranges of int8 inputs / outputs to match scales loaded from calibration cache - // TODO http://nvbugs/3262234 Change the network validation so that this workaround can be removed - setTensorScalesFromCalibration(network, build.inputFormats, build.outputFormats, build.calibration); - } - catch (std::exception&) - { - sample::gLogError - << "Int8IO was specified but impossible to read tensor scales from provided calibration cache file" - << std::endl; - return false; - } - } - nvinfer1::IOptimizationProfile* profileCalib{nullptr}; - if (!build.shapesCalib.empty()) - { - profileCalib = builder.createOptimizationProfile(); - for (uint32_t i = 0, n = network.getNbInputs(); i < n; i++) - { - auto* input = network.getInput(i); - nvinfer1::Dims profileDims{}; - auto shape = build.shapesCalib.find(input->getName()); - ShapeRange shapesCalib{}; - shapesCalib = shape->second; - - profileDims = toDims(shapesCalib[static_cast(nvinfer1::OptProfileSelector::kOPT)]); - // Here we check only kMIN as all profileDims are the same. 
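        // Editorial note (added commentary, not original code): the kMIN-only check
        // directly below is safe because profileCalib receives identical dims for
        // kMIN, kOPT and kMAX. Separately, the calibration cache consumed via
        // build.calibration stores one "tensorName: hexScale" pair per line, where
        // hexScale is the IEEE-754 bit pattern of the float scale, e.g. a
        // hypothetical line "images: 3c010204"; the resulting INT8 dynamic range is
        // [-127 * scale, 127 * scale].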
- SMP_RETVAL_IF_FALSE( - profileCalib->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, profileDims), - "Error in set dimensions to calibration profile OPT", false, err); - profileCalib->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, profileDims); - profileCalib->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, profileDims); - } - SMP_RETVAL_IF_FALSE(profileCalib->isValid(), "Calibration profile is invalid", false, err); - SMP_RETVAL_IF_FALSE(config.setCalibrationProfile(profileCalib), "Error in set calibration profile", false, err); - } - - std::vector elemCount{}; - for (int i = 0; i < network.getNbInputs(); i++) - { - auto* input = network.getInput(i); - auto const dims = input->getDimensions(); - auto const isDynamicInput = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }); - - if (profileCalib) - elemCount.push_back(volume(profileCalib->getDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT))); - else if (profile && isDynamicInput) - elemCount.push_back(volume(profile->getDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT))); - else - elemCount.push_back(volume(input->getDimensions())); - } - - config.setInt8Calibrator(new RndInt8Calibrator(1, elemCount, build.calibration, network, err)); - } - - if (build.directIO) - config.setFlag(nvinfer1::BuilderFlag::kDIRECT_IO); - - switch (build.precisionConstraints) - { - case PrecisionConstraints::kNONE: - // It's the default for TensorRT. - break; - case PrecisionConstraints::kOBEY: - config.setFlag(nvinfer1::BuilderFlag::kOBEY_PRECISION_CONSTRAINTS); - break; - case PrecisionConstraints::kPREFER: config.setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); break; - } - - if (!build.layerPrecisions.empty() && build.precisionConstraints != PrecisionConstraints::kNONE) - setLayerPrecisions(network, build.layerPrecisions); - - if (!build.layerOutputTypes.empty() && build.precisionConstraints != PrecisionConstraints::kNONE) - setLayerOutputTypes(network, build.layerOutputTypes); - - if (build.safe) - config.setEngineCapability(sys.DLACore != -1 ? nvinfer1::EngineCapability::kDLA_STANDALONE : nvinfer1::EngineCapability::kSAFETY); - - if (build.restricted) - config.setFlag(nvinfer1::BuilderFlag::kSAFETY_SCOPE); - - if (sys.DLACore != -1) - { - if (sys.DLACore < builder.getNbDLACores()) - { - config.setDefaultDeviceType(nvinfer1::DeviceType::kDLA); - config.setDLACore(sys.DLACore); - config.setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); - - if (sys.fallback) - config.setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); - else // Reformatting runs on GPU, so avoid I/O reformatting - config.setFlag(nvinfer1::BuilderFlag::kDIRECT_IO); - if (!build.int8) - config.setFlag(nvinfer1::BuilderFlag::kFP16); - } - else - { - err << "Cannot create DLA engine, " << sys.DLACore << " not available" << std::endl; - return false; - } - } - - if (build.enabledTactics || build.disabledTactics) - { - nvinfer1::TacticSources tacticSources = config.getTacticSources(); - tacticSources |= build.enabledTactics; - tacticSources &= ~build.disabledTactics; - config.setTacticSources(tacticSources); - } - - return true; -} - -//! -//! \brief Create an engine for a network defintion -//! -//! \return Pointer to the engine created or nullptr if the creation failed -//! 
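// Editorial sketch (not original code) of the timing-cache plumbing inside
// networkToEngine() below, shown in isolation; the file name is illustrative:
//
//     std::vector<char> blob = loadTimingCacheFile("timing.cache"); // empty if absent
//     std::unique_ptr<nvinfer1::ITimingCache> cache{
//         config->createTimingCache(blob.data(), blob.size())};     // empty blob => fresh cache
//     config->setTimingCache(*cache, false);                        // false: reject mismatched caches
//     ... builder.buildSerializedNetwork(network, *config) ...
//     std::unique_ptr<nvinfer1::IHostMemory> out{config->getTimingCache()->serialize()};
//     saveTimingCacheFile("timing.cache", out.get());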
-bool networkToEngine(const BuildOptions& build, const SystemOptions& sys, nvinfer1::IBuilder& builder, - BuildEnvironment& env, std::ostream& err) -{ - TrtUniquePtr config{builder.createBuilderConfig()}; - std::vector> sparseWeights; - SMP_RETVAL_IF_FALSE(config != nullptr, "Config creation failed", false, err); - SMP_RETVAL_IF_FALSE(setupNetworkAndConfig(build, sys, builder, *env.network, *config, err, sparseWeights), - "Network And Config setup failed", false, err); - - std::unique_ptr timingCache{nullptr}; - // Try to load cache from file. Create a fresh cache if the file doesn't exist - if (build.timingCacheMode == TimingCacheMode::kGLOBAL) - { - std::vector loadedCache = loadTimingCacheFile(build.timingCacheFile); - timingCache.reset(config->createTimingCache(static_cast(loadedCache.data()), loadedCache.size())); - SMP_RETVAL_IF_FALSE(timingCache != nullptr, "TimingCache creation failed", false, err); - config->setTimingCache(*timingCache, false); - } - - // CUDA stream used for profiling by the builder. - auto profileStream = samplesCommon::makeCudaStream(); - SMP_RETVAL_IF_FALSE(profileStream != nullptr, "Cuda stream creation failed", false, err); - config->setProfileStream(*profileStream); - - TrtUniquePtr serializedEngine{builder.buildSerializedNetwork(*env.network, *config)}; - SMP_RETVAL_IF_FALSE(serializedEngine != nullptr, "Engine could not be created from network", false, err); - - env.engineBlob.resize(serializedEngine->size()); - std::memcpy(env.engineBlob.data(), serializedEngine->data(), serializedEngine->size()); - - if (build.safe) - { - ASSERT(sample::hasSafeRuntime()); - std::unique_ptr safeRuntime{sample::createSafeInferRuntime(sample::gLogger.getTRTLogger())}; - SMP_RETVAL_IF_FALSE(safeRuntime != nullptr, "SafeRuntime creation failed", false, err); - safeRuntime->setErrorRecorder(&gRecorder); - env.safeEngine.reset(safeRuntime->deserializeCudaEngine(serializedEngine->data(), serializedEngine->size())); - if (build.consistency) - checkSafeEngine(serializedEngine->data(), serializedEngine->size()); - - SMP_RETVAL_IF_FALSE(env.safeEngine != nullptr, "SafeEngine deserialization failed", false, err); - } - else - { - TrtUniquePtr runtime{nvinfer1::createInferRuntime(sample::gLogger.getTRTLogger())}; - SMP_RETVAL_IF_FALSE(runtime != nullptr, "Runtime creation failed", false, err); - runtime->setErrorRecorder(&gRecorder); - env.engine.reset(runtime->deserializeCudaEngine(serializedEngine->data(), serializedEngine->size())); - SMP_RETVAL_IF_FALSE(env.engine != nullptr, "Engine deserialization failed", false, err); - if (build.timingCacheMode == TimingCacheMode::kGLOBAL) - { - auto const& timingCache = config->getTimingCache(); - std::unique_ptr timingCacheHostData{timingCache->serialize()}; - SMP_RETVAL_IF_FALSE(timingCacheHostData != nullptr, "Timing Cache serialization failed", false, err); - saveTimingCacheFile(build.timingCacheFile, timingCacheHostData.get()); - } - if (config->getInt8Calibrator()) - delete config->getInt8Calibrator(); - } - return true; -} - -//! -//! \brief Parse a given model, create a network and an engine. -//! -bool modelToBuildEnv( - const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, BuildEnvironment& env, std::ostream& err) -{ - TrtUniquePtr builder{nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger())}; - SMP_RETVAL_IF_FALSE(builder != nullptr, "Builder creation failed", false, err); - builder->setErrorRecorder(&gRecorder); - auto networkFlags = (build.maxBatch) ? 
0U : 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); - - env.network.reset(builder->createNetworkV2(networkFlags)); - SMP_RETVAL_IF_FALSE(env.network != nullptr, "Network creation failed", false, err); - env.parser = modelToNetwork(model, *env.network, err); - SMP_RETVAL_IF_FALSE(env.parser.operator bool(), "Parsing model failed", false, err); - SMP_RETVAL_IF_FALSE(networkToEngine(build, sys, *builder, env, err), "Building engine failed", false, err); - return true; -} - -namespace -{ -std::pair, std::vector> getLayerWeightsRolePair(nvinfer1::IRefitter& refitter) -{ - // Get number of refittable items. - auto const nbAll = refitter.getAll(0, nullptr, nullptr); - std::vector layerNames(nbAll); - // Allocate buffers for the items and get them. - std::vector weightsRoles(nbAll); - refitter.getAll(nbAll, layerNames.data(), weightsRoles.data()); - std::vector layerNameStrs(nbAll); - std::transform(layerNames.begin(), layerNames.end(), layerNameStrs.begin(), [](char const* name) { - if (name == nullptr) - return std::string{}; - - return std::string{name}; - }); - return {layerNameStrs, weightsRoles}; -} - -std::pair, std::vector> getMissingLayerWeightsRolePair(nvinfer1::IRefitter& refitter) -{ - // Get number of refittable items. - auto const nbMissing = refitter.getMissing(0, nullptr, nullptr); - std::vector layerNames(nbMissing); - // Allocate buffers for the items and get them. - std::vector weightsRoles(nbMissing); - refitter.getMissing(nbMissing, layerNames.data(), weightsRoles.data()); - std::vector layerNameStrs(nbMissing); - std::transform(layerNames.begin(), layerNames.end(), layerNameStrs.begin(), [](char const* name) { - if (name == nullptr) - return std::string{}; - return std::string{name}; - }); - return {layerNameStrs, weightsRoles}; -} - -bool loadEngineToEnv(const std::string& engine, int DLACore, bool safe, bool enableConsistency, BuildEnvironment& env, std::ostream& err) -{ - std::ifstream engineFile(engine, std::ios::binary); - SMP_RETVAL_IF_FALSE(engineFile.good(), "", false, err << "Error opening engine file: " << engine); - engineFile.seekg(0, std::ifstream::end); - int64_t fsize = engineFile.tellg(); - engineFile.seekg(0, std::ifstream::beg); - - env.engineBlob.resize(fsize); - engineFile.read(reinterpret_cast(env.engineBlob.data()), fsize); - SMP_RETVAL_IF_FALSE(engineFile.good(), "", false, err << "Error loading engine file: " << engine); - - if (safe) - { - ASSERT(sample::hasSafeRuntime()); - std::unique_ptr safeRuntime{sample::createSafeInferRuntime(sample::gLogger.getTRTLogger())}; - safeRuntime->setErrorRecorder(&gRecorder); - env.safeEngine.reset(safeRuntime->deserializeCudaEngine(env.engineBlob.data(), fsize)); - bool result = env.safeEngine != nullptr; - if (result && enableConsistency) - { - checkSafeEngine(env.engineBlob.data(), fsize); - } - return result; - } - - TrtUniquePtr runtime{nvinfer1::createInferRuntime(sample::gLogger.getTRTLogger())}; - if (DLACore != -1) - runtime->setDLACore(DLACore); - - runtime->setErrorRecorder(&gRecorder); - env.engine.reset(runtime->deserializeCudaEngine(env.engineBlob.data(), fsize)); - return env.engine != nullptr; -} -} // namespace - -void dumpRefittable(nvinfer1::ICudaEngine& engine) -{ - TrtUniquePtr refitter{nvinfer1::createInferRefitter(engine, sample::gLogger.getTRTLogger())}; - if (refitter == nullptr) - { - sample::gLogError << "Failed to create a refitter." 
<< std::endl; - return; - } - - auto const& layerWeightsRolePair = getLayerWeightsRolePair(*refitter); - auto const& layerNames = layerWeightsRolePair.first; - auto const& weightsRoles = layerWeightsRolePair.second; - auto const nbAll = layerWeightsRolePair.first.size(); - for (size_t i = 0; i < nbAll; ++i) - { - sample::gLogInfo << layerNames[i] << " " << weightsRoles[i] << std::endl; - } -} - -nvinfer1::ICudaEngine* loadEngine(const std::string& engine, int DLACore, std::ostream& err) -{ - BuildEnvironment env; - return loadEngineToEnv(engine, DLACore, false, false, env, err) ? env.engine.release() : nullptr; -} - -bool saveEngine(const nvinfer1::ICudaEngine& engine, const std::string& fileName, std::ostream& err) -{ - std::ofstream engineFile(fileName, std::ios::binary); - if (!engineFile) - { - err << "Cannot open engine file: " << fileName << std::endl; - return false; - } - - TrtUniquePtr serializedEngine{engine.serialize()}; - if (serializedEngine == nullptr) - { - err << "Engine serialization failed" << std::endl; - return false; - } - - engineFile.write(static_cast(serializedEngine->data()), serializedEngine->size()); - return !engineFile.fail(); -} - -bool getEngineBuildEnv(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, - BuildEnvironment& env, std::ostream& err) -{ - TrtUniquePtr engine; - TrtUniquePtr network; - Parser parser; - - bool createEngineSuccess {false}; - - if (build.load) - createEngineSuccess = loadEngineToEnv(build.engine, sys.DLACore, build.safe, build.consistency, env, err); - else - createEngineSuccess = modelToBuildEnv(model, build, sys, env, err); - - SMP_RETVAL_IF_FALSE(createEngineSuccess, "Failed to create engine from model.", false, err); - - if (build.save) - { - std::ofstream engineFile(build.engine, std::ios::binary); - engineFile.write(reinterpret_cast(env.engineBlob.data()), env.engineBlob.size()); - SMP_RETVAL_IF_FALSE(!engineFile.fail(), "Saving engine to file failed.", false, err); - } - return true; -} - -nvinfer1::IHostMemory* networkToSerialized(const BuildOptions& build, const SystemOptions& sys, nvinfer1::IBuilder& builder, - nvinfer1::INetworkDefinition& network, std::ostream& err) -{ - TrtUniquePtr config{builder.createBuilderConfig()}; - std::vector> sparseWeights; - SMP_RETVAL_IF_FALSE(config != nullptr, "Config creation failed", nullptr, err); - SMP_RETVAL_IF_FALSE(setupNetworkAndConfig(build, sys, builder, network, *config, err, sparseWeights), - "Network And Config setup failed", nullptr, err); - return builder.buildSerializedNetwork(network, *config); -} - -nvinfer1::IHostMemory* modelToSerialized( - const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err) -{ - TrtUniquePtr builder{nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger())}; - SMP_RETVAL_IF_FALSE(builder != nullptr, "Builder creation failed", nullptr, err); - builder->setErrorRecorder(&gRecorder); - - auto networkFlags - = (build.maxBatch) ? 
0U : 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); - - TrtUniquePtr network{builder->createNetworkV2(networkFlags)}; - SMP_RETVAL_IF_FALSE(network != nullptr, "Network creation failed", nullptr, err); - - Parser parser = modelToNetwork(model, *network, err); - SMP_RETVAL_IF_FALSE(parser.operator bool(), "Parsing model failed", nullptr, err); - - return networkToSerialized(build, sys, *builder, *network, err); -} - -bool serializeAndSave(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err) -{ - TrtUniquePtr serialized{modelToSerialized(model, build, sys, err)}; - SMP_RETVAL_IF_FALSE(serialized != nullptr, "Network serialization failed", false, err); - - std::ofstream engineFile(build.engine, std::ios::binary); - SMP_RETVAL_IF_FALSE(!!engineFile, "Cannot open a file to save a serialize network", false, err); - engineFile.write(static_cast(serialized->data()), serialized->size()); - return !engineFile.fail(); -} - -// There is not a getWeightsName API, so we need to use WeightsRole. -std::vector> getAllRefitWeightsForLayer(const nvinfer1::ILayer& l) -{ - switch (l.getType()) - { - case nvinfer1::LayerType::kCONSTANT: - { - const auto& layer = static_cast(l); - return {std::make_pair(nvinfer1::WeightsRole::kCONSTANT, layer.getWeights())}; - } - case nvinfer1::LayerType::kCONVOLUTION: - { - const auto& layer = static_cast(l); - return {std::make_pair(nvinfer1::WeightsRole::kKERNEL, layer.getKernelWeights()), - std::make_pair(nvinfer1::WeightsRole::kBIAS, layer.getBiasWeights())}; - } - case nvinfer1::LayerType::kDECONVOLUTION: - { - const auto& layer = static_cast(l); - return {std::make_pair(nvinfer1::WeightsRole::kKERNEL, layer.getKernelWeights()), - std::make_pair(nvinfer1::WeightsRole::kBIAS, layer.getBiasWeights())}; - } - case nvinfer1::LayerType::kFULLY_CONNECTED: - { - const auto& layer = static_cast(l); - return {std::make_pair(nvinfer1::WeightsRole::kKERNEL, layer.getKernelWeights()), - std::make_pair(nvinfer1::WeightsRole::kBIAS, layer.getBiasWeights())}; - } - case nvinfer1::LayerType::kSCALE: - { - const auto& layer = static_cast(l); - return {std::make_pair(nvinfer1::WeightsRole::kSCALE, layer.getScale()), - std::make_pair(nvinfer1::WeightsRole::kSHIFT, layer.getShift())}; - } - case nvinfer1::LayerType::kRNN_V2: - case nvinfer1::LayerType::kACTIVATION: - case nvinfer1::LayerType::kPOOLING: - case nvinfer1::LayerType::kLRN: - case nvinfer1::LayerType::kSOFTMAX: - case nvinfer1::LayerType::kSHUFFLE: - case nvinfer1::LayerType::kCONCATENATION: - case nvinfer1::LayerType::kELEMENTWISE: - case nvinfer1::LayerType::kPLUGIN: - case nvinfer1::LayerType::kUNARY: - case nvinfer1::LayerType::kPADDING: - case nvinfer1::LayerType::kREDUCE: - case nvinfer1::LayerType::kTOPK: - case nvinfer1::LayerType::kGATHER: - case nvinfer1::LayerType::kMATRIX_MULTIPLY: - case nvinfer1::LayerType::kRAGGED_SOFTMAX: - case nvinfer1::LayerType::kIDENTITY: - case nvinfer1::LayerType::kPLUGIN_V2: - case nvinfer1::LayerType::kSLICE: - case nvinfer1::LayerType::kFILL: - case nvinfer1::LayerType::kSHAPE: - case nvinfer1::LayerType::kPARAMETRIC_RELU: - case nvinfer1::LayerType::kRESIZE: - case nvinfer1::LayerType::kTRIP_LIMIT: - case nvinfer1::LayerType::kRECURRENCE: - case nvinfer1::LayerType::kITERATOR: - case nvinfer1::LayerType::kLOOP_OUTPUT: - case nvinfer1::LayerType::kSELECT: - case nvinfer1::LayerType::kQUANTIZE: - case nvinfer1::LayerType::kDEQUANTIZE: - case nvinfer1::LayerType::kCONDITION: - case 
nvinfer1::LayerType::kCONDITIONAL_INPUT: - case nvinfer1::LayerType::kCONDITIONAL_OUTPUT: - case nvinfer1::LayerType::kSCATTER: - case nvinfer1::LayerType::kEINSUM: - case nvinfer1::LayerType::kASSERTION: return {}; - } - return {}; -} - -bool timeRefit(nvinfer1::INetworkDefinition const& network, nvinfer1::ICudaEngine& engine, bool multiThreading) -{ - using time_point = std::chrono::time_point; - using durationMs = std::chrono::duration; - - auto const nbLayers = network.getNbLayers(); - TrtUniquePtr refitter{nvinfer1::createInferRefitter(engine, sample::gLogger.getTRTLogger())}; - // Set max threads that can be used by refitter. - if (multiThreading && !refitter->setMaxThreads(10)) - { - sample::gLogError << "Failed to set max threads to refitter." << std::endl; - return false; - } - auto const& layerWeightsRolePair = getLayerWeightsRolePair(*refitter); - // We use std::string instead of const char* since we can have copies of layer names. - std::set> layerRoleSet; - - auto const& layerNames = layerWeightsRolePair.first; - auto const& weightsRoles = layerWeightsRolePair.second; - - std::transform(layerNames.begin(), layerNames.end(), weightsRoles.begin(), - std::inserter(layerRoleSet, layerRoleSet.begin()), - [](std::string const& layerName, nvinfer1::WeightsRole const role) { return std::make_pair(layerName, role); }); - - auto const isRefittable = [&layerRoleSet](char const* layerName, nvinfer1::WeightsRole const role) { - return layerRoleSet.find(std::make_pair(layerName, role)) != layerRoleSet.end(); - }; - - auto const setWeights = [&] { - for (int32_t i = 0; i < nbLayers; i++) - { - auto const layer = network.getLayer(i); - auto const roleWeightsVec = getAllRefitWeightsForLayer(*layer); - for (auto const& roleWeights : roleWeightsVec) - { - if (isRefittable(layer->getName(), roleWeights.first)) - { - bool const success = refitter->setWeights(layer->getName(), roleWeights.first, roleWeights.second); - if (!success) - return false; - } - } - } - return true; - }; - - auto const reportMissingWeights = [&] { - auto const& missingPair = getMissingLayerWeightsRolePair(*refitter); - auto const& layerNames = missingPair.first; - auto const& weightsRoles = missingPair.second; - for (size_t i = 0; i < layerNames.size(); ++i) - { - sample::gLogError << "Missing (" << layerNames[i] << ", " << weightsRoles[i] << ") for refitting." - << std::endl; - } - return layerNames.empty(); - }; - - // Warm up and report missing weights - bool const success = setWeights() && reportMissingWeights() && refitter->refitCudaEngine(); - if (!success) - { - return false; - } - - constexpr int32_t loop = 10; - time_point const refitStartTime{std::chrono::steady_clock::now()}; - { - for (int32_t l = 0; l < loop; l++) - { - bool const success = setWeights() && refitter->refitCudaEngine(); - if (!success) - { - return false; - } - } - } - time_point const refitEndTime{std::chrono::steady_clock::now()}; - - sample::gLogInfo << "Engine refitted" - << " in " << durationMs(refitEndTime - refitStartTime).count() / loop << " ms." << std::endl; - return true; -} - -namespace -{ -void* initSafeRuntime() -{ - void* handle{nullptr}; -#if !defined(_WIN32) - std::string const dllName{samplesCommon::isDebug() ? 
"libnvinfer_safe_debug.so.8" : "libnvinfer_safe.so.8"}; -#if SANITIZER_BUILD - handle = dlopen(dllName.c_str(), RTLD_LAZY | RTLD_NODELETE); -#else - handle = dlopen(dllName.c_str(), RTLD_LAZY); -#endif -#endif - return handle; -} - -void* initConsistencyCheckerLibrary() -{ - void* handle{nullptr}; -#if !defined(_WIN32) - std::string const dllName{samplesCommon::isDebug() ? "libnvinfer_checker_debug.so.8" : "libnvinfer_checker.so.8"}; -#if SANITIZER_BUILD - handle = dlopen(dllName.c_str(), RTLD_LAZY | RTLD_NODELETE); -#else - handle = dlopen(dllName.c_str(), RTLD_LAZY); -#endif -#endif - return handle; -} - -#if !defined(_WIN32) -struct DllDeleter -{ - void operator()(void* handle) - { - if (handle != nullptr) - { - dlclose(handle); - } - } -}; -const std::unique_ptr safeRuntimeLibrary{initSafeRuntime()}; -const std::unique_ptr consistencyCheckerLibrary{initConsistencyCheckerLibrary()}; -#endif -} // namespace - -bool hasSafeRuntime() -{ - bool ret{false}; -#if !defined(_WIN32) - ret = (safeRuntimeLibrary != nullptr); -#endif - return ret; -} - -nvinfer1::safe::IRuntime* createSafeInferRuntime(nvinfer1::ILogger& logger) noexcept -{ - nvinfer1::safe::IRuntime* runtime{nullptr}; -#if !defined(_WIN32) - constexpr char symbolName[] = "_ZN8nvinfer14safe18createInferRuntimeERNS_7ILoggerE"; - typedef nvinfer1::safe::IRuntime* (*CreateInferRuntimeFn)(nvinfer1::ILogger & logger); - if (hasSafeRuntime()) - { - auto createFn = reinterpret_cast(dlsym(safeRuntimeLibrary.get(), symbolName)); - if (createFn != nullptr) - { - runtime = createFn(logger); - } - } -#endif - return runtime; -} - -bool hasConsistencyChecker() -{ - bool ret{false}; -#if !defined(_WIN32) - ret = (consistencyCheckerLibrary != nullptr); -#endif - return ret; -} - -nvinfer1::consistency::IConsistencyChecker* createConsistencyChecker( - nvinfer1::ILogger& logger, void const* serializedEngine, int32_t const engineSize) noexcept -{ - nvinfer1::consistency::IConsistencyChecker* checker{nullptr}; - - if (serializedEngine == nullptr || engineSize == 0) - { - return checker; - } - -#if !defined(_WIN32) - constexpr char symbolName[] = "createConsistencyChecker_INTERNAL"; - typedef nvinfer1::consistency::IConsistencyChecker* (*CreateCheckerFn)( - nvinfer1::ILogger * logger, void const* data, size_t size, uint32_t version); - if (hasSafeRuntime()) - { - auto createFn = reinterpret_cast(dlsym(consistencyCheckerLibrary.get(), symbolName)); - if (createFn != nullptr) - { - checker = createFn(&logger, serializedEngine, engineSize, NV_TENSORRT_VERSION); - } - } -#endif - return checker; -} - -bool checkSafeEngine(void const* serializedEngine, int32_t const engineSize) -{ - - if (!hasConsistencyChecker()) - { - sample::gLogError << "Cannot perform consistency check because the checker is not loaded.." << std::endl; - return false; - } - auto checker = std::unique_ptr( - createConsistencyChecker(sample::gLogger.getTRTLogger(), serializedEngine, engineSize)); - if (checker.get() == nullptr) - { - sample::gLogError << "Failed to create consistency checker." << std::endl; - return false; - } - sample::gLogInfo << "Start consistency checking." << std::endl; - if (!checker->validate()) - { - sample::gLogError << "Consistency validation failed." << std::endl; - return false; - } - sample::gLogInfo << "Consistency validation passed." 
<< std::endl; - return true; -} -} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.h deleted file mode 100644 index 620b51a1..00000000 --- a/src/Detector/tensorrt_yolo/common_deprecated/sampleEngines.h +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef TRT_SAMPLE_ENGINES_H -#define TRT_SAMPLE_ENGINES_H - -#include -#include - -#include "NvInfer.h" - -#if (NV_TENSORRT_MAJOR > 7) - -#include "NvInferConsistency.h" -#include "NvInferSafeRuntime.h" - -#endif - -#include "NvOnnxParser.h" -#include "sampleOptions.h" -#include "sampleUtils.h" - -namespace sample -{ - -struct Parser -{ - TrtUniquePtr onnxParser; - - operator bool() const - { - return onnxParser.operator bool(); - } -}; - -struct BuildEnvironment -{ - TrtUniquePtr network; - //! Parser that creates the network. Must be declared *after* network, so that when - //! ~BuildEnvironment() executes, the parser is destroyed before the network is destroyed. - Parser parser; - TrtUniquePtr engine; - std::unique_ptr safeEngine; - std::vector engineBlob; -}; - -//! -//! \brief Generate a network definition for a given model -//! -//! \return Parser The parser used to initialize the network and that holds the weights for the network, or an invalid -//! parser (the returned parser converts to false if tested) -//! -//! Constant input dimensions in the model must not be changed in the corresponding -//! network definition, because its correctness may rely on the constants. -//! -//! \see Parser::operator bool() -//! -Parser modelToNetwork(const ModelOptions& model, nvinfer1::INetworkDefinition& network, std::ostream& err); - -//! -//! \brief Set up network and config -//! -//! \return boolean Return true if network and config were successfully set -//! -bool setupNetworkAndConfig(const BuildOptions& build, const SystemOptions& sys, nvinfer1::IBuilder& builder, - nvinfer1::INetworkDefinition& network, nvinfer1::IBuilderConfig& config, std::ostream& err, - std::vector>& sparseWeights); - -//! -//! \brief Log refittable layers and weights of a refittable engine -//! -void dumpRefittable(nvinfer1::ICudaEngine& engine); - -//! -//! \brief Load a serialized engine -//! -//! \return Pointer to the engine loaded or nullptr if the operation failed -//! -nvinfer1::ICudaEngine* loadEngine(const std::string& engine, int DLACore, std::ostream& err); - -//! -//! \brief Save an engine into a file -//! -//! \return boolean Return true if the engine was successfully saved -//! -bool saveEngine(const nvinfer1::ICudaEngine& engine, const std::string& fileName, std::ostream& err); - -//! -//! \brief Create an engine from model or serialized file, and optionally save engine -//! -//! \return Pointer to the engine created or nullptr if the creation failed -//! 
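The loadEngine()/saveEngine() declarations above round-trip an engine through a file. A condensed sketch of that flow against the TensorRT 8.x API (function names here are local to the example):

    #include <NvInfer.h>
    #include <fstream>
    #include <iterator>
    #include <memory>
    #include <string>
    #include <vector>

    // Sketch of the save/load round trip, assuming TensorRT 8.x.
    bool saveEngineBlob(nvinfer1::ICudaEngine& engine, const std::string& path)
    {
        std::unique_ptr<nvinfer1::IHostMemory> blob{engine.serialize()};
        if (!blob)
            return false;
        std::ofstream file(path, std::ios::binary);
        file.write(static_cast<const char*>(blob->data()), blob->size());
        return !file.fail();
    }

    nvinfer1::ICudaEngine* loadEngineBlob(nvinfer1::IRuntime& runtime, const std::string& path)
    {
        std::ifstream file(path, std::ios::binary);
        std::vector<char> blob{std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>()};
        return blob.empty() ? nullptr : runtime.deserializeCudaEngine(blob.data(), blob.size());
    }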
-bool getEngineBuildEnv(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys,
-    BuildEnvironment& env, std::ostream& err);
-
-//!
-//! \brief Create an engine from model or serialized file, and optionally save engine
-//!
-//! \return Pointer to the engine created or nullptr if the creation failed
-//!
-inline TrtUniquePtr<nvinfer1::ICudaEngine> getEngine(
-    const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err)
-{
-    BuildEnvironment env;
-    TrtUniquePtr<nvinfer1::ICudaEngine> engine;
-    if (getEngineBuildEnv(model, build, sys, env, err))
-    {
-        engine.swap(env.engine);
-    }
-    return engine;
-}
-
-//!
-//! \brief Create a serialized network
-//!
-//! \return Pointer to a host memory for a serialized network
-//!
-nvinfer1::IHostMemory* networkToSerialized(const BuildOptions& build, const SystemOptions& sys, nvinfer1::IBuilder& builder,
-    nvinfer1::INetworkDefinition& network, std::ostream& err);
-
-//!
-//! \brief Transfer model to a serialized network
-//!
-//! \return Pointer to a host memory for a serialized network
-//!
-nvinfer1::IHostMemory* modelToSerialized(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err);
-
-//!
-//! \brief Serialize network and save it into a file
-//!
-//! \return boolean Return true if the network was successfully serialized and saved
-//!
-bool serializeAndSave(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err);
-
-bool timeRefit(const nvinfer1::INetworkDefinition& network, nvinfer1::ICudaEngine& engine, bool multiThreading);
-
-//!
-//! \brief Set tensor scales from a calibration table
-//!
-void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, const std::vector<IOFormat>& inputFormats,
-    const std::vector<IOFormat>& outputFormats, const std::string& calibrationFile);
-
-//!
-//! \brief Check if safe runtime is loaded.
-//!
-bool hasSafeRuntime();
-
-//!
-//! \brief Create a safe runtime object if the dynamic library is loaded.
-//!
-nvinfer1::safe::IRuntime* createSafeInferRuntime(nvinfer1::ILogger& logger) noexcept;
-
-//!
-//! \brief Check if consistency checker is loaded.
-//!
-bool hasConsistencyChecker();
-
-//!
-//! \brief Create a consistency checker object if the dynamic library is loaded.
-//!
-nvinfer1::consistency::IConsistencyChecker* createConsistencyChecker(
-    nvinfer1::ILogger& logger, nvinfer1::IHostMemory const* engine) noexcept;
-
-//!
-//! \brief Run consistency check on serialized engine.
-//!
-bool checkSafeEngine(void const* serializedEngine, int32_t const engineSize);
-} // namespace sample
-
-#endif // TRT_SAMPLE_ENGINES_H
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleInference.cpp b/src/Detector/tensorrt_yolo/common_deprecated/sampleInference.cpp
deleted file mode 100644
index 51f16882..00000000
--- a/src/Detector/tensorrt_yolo/common_deprecated/sampleInference.cpp
+++ /dev/null
@@ -1,990 +0,0 @@
-/*
- * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(__QNX__) -#include -#include -#endif - -#include "NvInfer.h" - -#include "ErrorRecorder.h" -#include "logger.h" -#include "sampleDevice.h" -#include "sampleEngines.h" -#include "sampleInference.h" -#include "sampleOptions.h" -#include "sampleReporting.h" -#include "sampleUtils.h" - -namespace sample -{ - -template -bool validateTensorNames( - const MapType& map, const EngineType* engine, const int32_t endBindingIndex) -{ - // Check if the provided input tensor names match the input tensors of the engine. - // Throw an error if the provided input tensor names cannot be found because it implies a potential typo. - for (const auto& item : map) - { - bool tensorNameFound{false}; - for (int32_t b = 0; b < endBindingIndex; ++b) - { - if (engine->bindingIsInput(b) && engine->getBindingName(b) == item.first) - { - tensorNameFound = true; - break; - } - } - if (!tensorNameFound) - { - sample::gLogError << "Cannot find input tensor with name \"" << item.first << "\" in the engine bindings! " - << "Please make sure the input tensor names are correct." << std::endl; - return false; - } - } - return true; -} - -template -class FillBindingClosure -{ -private: - using InputsMap = std::unordered_map; - using BindingsVector = std::vector>; - - EngineType const* engine; - ContextType const* context; - InputsMap const& inputs; - BindingsVector& bindings; - int32_t batch; - int32_t endBindingIndex; - - void fillOneBinding(int32_t bindingIndex, int64_t vol) - { - auto const dims = getDims(bindingIndex); - auto const name = engine->getBindingName(bindingIndex); - auto const isInput = engine->bindingIsInput(bindingIndex); - auto const dataType = engine->getBindingDataType(bindingIndex); - auto const *bindingInOutStr = isInput ? "input" : "output"; - for (auto& binding : bindings) - { - const auto input = inputs.find(name); - if (isInput && input != inputs.end()) - { - sample::gLogInfo << "Using values loaded from " << input->second << " for input " << name << std::endl; - binding->addBinding(bindingIndex, name, isInput, vol, dataType, input->second); - } - else - { - sample::gLogInfo << "Using random values for " << bindingInOutStr << " " << name << std::endl; - binding->addBinding(bindingIndex, name, isInput, vol, dataType); - } - sample::gLogInfo << "Created " << bindingInOutStr <<" binding for " << name << " with dimensions " << dims << std::endl; - } - } - - bool fillAllBindings(int32_t batch, int32_t endBindingIndex) - { - if (!validateTensorNames(inputs, engine, endBindingIndex)) - { - sample::gLogError << "Invalid tensor names found in --loadInputs flag." 
<< std::endl; - return false; - } - - for (int32_t b = 0; b < endBindingIndex; b++) - { - auto const dims = getDims(b); - auto const comps = engine->getBindingComponentsPerElement(b); - auto const strides = context->getStrides(b); - int32_t const vectorDimIndex = engine->getBindingVectorizedDim(b); - auto const vol = volume(dims, strides, vectorDimIndex, comps, batch); - fillOneBinding(b, vol); - } - return true; - } - - nvinfer1::Dims getDims(int32_t bindingIndex); - -public: - FillBindingClosure(EngineType const* _engine, ContextType const* _context, InputsMap const& _inputs, BindingsVector& _bindings, int32_t _batch, int32_t _endBindingIndex) - : engine(_engine) - , context(_context) - , inputs(_inputs) - , bindings(_bindings) - , batch(_batch) - , endBindingIndex(_endBindingIndex) - { - } - - bool operator()() - { - return fillAllBindings(batch, endBindingIndex); - } -}; - -template <> -nvinfer1::Dims FillBindingClosure::getDims(int32_t bindingIndex) -{ - return context->getBindingDimensions(bindingIndex); -} - -template <> -nvinfer1::Dims FillBindingClosure::getDims(int32_t bindingIndex) -{ - return engine->getBindingDimensions(bindingIndex); -} - -bool setUpInference(InferenceEnvironment& iEnv, const InferenceOptions& inference) -{ - int32_t device{}; - cudaCheck(cudaGetDevice(&device)); - - cudaDeviceProp properties; - cudaCheck(cudaGetDeviceProperties(&properties, device)); - // Use managed memory on integrated devices when transfers are skipped - // and when it is explicitly requested on the commandline. - bool useManagedMemory{(inference.skipTransfers && properties.integrated) || inference.useManaged}; - using FillSafeBindings = FillBindingClosure; - if (iEnv.safe) - { - ASSERT(sample::hasSafeRuntime()); - auto* safeEngine = iEnv.safeEngine.get(); - for (int32_t s = 0; s < inference.streams; ++s) - { - iEnv.safeContext.emplace_back(safeEngine->createExecutionContext()); - iEnv.bindings.emplace_back(new Bindings(useManagedMemory)); - } - const int32_t nBindings = safeEngine->getNbBindings(); - auto const* safeContext = iEnv.safeContext.front().get(); - // batch is set to 1 because safety only support explicit batch. - return FillSafeBindings(iEnv.safeEngine.get(), safeContext, inference.inputs, iEnv.bindings, 1, nBindings)(); - } - - using FillStdBindings = FillBindingClosure; - - for (int32_t s = 0; s < inference.streams; ++s) - { - auto ec = iEnv.engine->createExecutionContext(); - if (ec == nullptr) - { - sample::gLogError << "Unable to create execution context for stream " << s << "." << std::endl; - return false; - } - iEnv.context.emplace_back(ec); - iEnv.bindings.emplace_back(new Bindings(useManagedMemory)); - } - if (iEnv.profiler) - { - iEnv.context.front()->setProfiler(iEnv.profiler.get()); - // Always run reportToProfiler() after enqueue launch - iEnv.context.front()->setEnqueueEmitsProfile(false); - } - - const int32_t nOptProfiles = iEnv.engine->getNbOptimizationProfiles(); - const int32_t nBindings = iEnv.engine->getNbBindings(); - const int32_t bindingsInProfile = nOptProfiles > 0 ? nBindings / nOptProfiles : 0; - const int32_t endBindingIndex = bindingsInProfile ? bindingsInProfile : iEnv.engine->getNbBindings(); - - if (nOptProfiles > 1) - { - sample::gLogWarning << "Multiple profiles are currently not supported. Running with one profile." << std::endl; - } - - // Make sure that the tensor names provided in command-line args actually exist in any of the engine bindings - // to avoid silent typos. 
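setUpInference() above creates one execution context per stream and then walks every binding slot so buffers can be sized and filled. A condensed sketch of that setup using the pre-TensorRT-10 binding-index API this deprecated file targets:

    #include <NvInfer.h>
    #include <iostream>
    #include <memory>
    #include <vector>

    // One context per stream, then enumerate bindings (pre-TRT-10 API).
    // `engine` is assumed to be provided by the caller.
    std::vector<std::unique_ptr<nvinfer1::IExecutionContext>> makeContexts(
        nvinfer1::ICudaEngine& engine, int32_t streams)
    {
        std::vector<std::unique_ptr<nvinfer1::IExecutionContext>> contexts;
        for (int32_t s = 0; s < streams; ++s)
            contexts.emplace_back(engine.createExecutionContext());
        for (int32_t b = 0; b < engine.getNbBindings(); ++b)
        {
            // Inputs and outputs share one index space; per-binding buffer
            // allocation based on dims and dtype would happen here.
            std::cout << (engine.bindingIsInput(b) ? "input " : "output ")
                      << engine.getBindingName(b) << std::endl;
        }
        return contexts;
    }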
- if (!validateTensorNames(inference.shapes, iEnv.engine.get(), endBindingIndex)) - { - sample::gLogError << "Invalid tensor names found in --shapes flag." << std::endl; - return false; - } - - // Set all input dimensions before all bindings can be allocated - for (int32_t b = 0; b < endBindingIndex; ++b) - { - if (iEnv.engine->bindingIsInput(b)) - { - auto dims = iEnv.context.front()->getBindingDimensions(b); - const bool isScalar = dims.nbDims == 0; - const bool isDynamicInput = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }) - || iEnv.engine->isShapeBinding(b); - if (isDynamicInput) - { - auto shape = inference.shapes.find(iEnv.engine->getBindingName(b)); - - std::vector staticDims; - if (shape == inference.shapes.end()) - { - // If no shape is provided, set dynamic dimensions to 1. - constexpr int32_t DEFAULT_DIMENSION = 1; - if (iEnv.engine->isShapeBinding(b)) - { - if (isScalar) - { - staticDims.push_back(1); - } - else - { - staticDims.resize(dims.d[0]); - std::fill(staticDims.begin(), staticDims.end(), DEFAULT_DIMENSION); - } - } - else - { - staticDims.resize(dims.nbDims); - std::transform(dims.d, dims.d + dims.nbDims, staticDims.begin(), - [&](int32_t dimension) { return dimension >= 0 ? dimension : DEFAULT_DIMENSION; }); - } - sample::gLogWarning << "Dynamic dimensions required for input: " << iEnv.engine->getBindingName(b) - << ", but no shapes were provided. Automatically overriding shape to: " - << staticDims << std::endl; - } - else if (inference.inputs.count(shape->first) && iEnv.engine->isShapeBinding(b)) - { - if (isScalar || dims.nbDims == 1) - { - // Load shape tensor from file. - size_t const size = isScalar ? 1 : dims.d[0]; - staticDims.resize(size); - auto const& filename = inference.inputs.at(shape->first); - auto dst = reinterpret_cast(staticDims.data()); - loadFromFile(filename, dst, size * sizeof(decltype(staticDims)::value_type)); - } - else - { - sample::gLogWarning << "Cannot load shape tensor " << shape->first << " from file, " - << "ND-Shape isn't supported yet" << std::endl; - // Fallback - staticDims = shape->second; - } - } - else - { - staticDims = shape->second; - } - - for (auto& c : iEnv.context) - { - if (iEnv.engine->isShapeBinding(b)) - { - if (!c->setInputShapeBinding(b, staticDims.data())) - { - return false; - } - } - else - { - if (!c->setBindingDimensions(b, toDims(staticDims))) - { - return false; - } - } - } - } - } - } - - auto* engine = iEnv.engine.get(); - auto const* context = iEnv.context.front().get(); - int32_t const batch = engine->hasImplicitBatchDimension() ? inference.batch : 1; - return FillStdBindings(engine, context, inference.inputs, iEnv.bindings, batch, endBindingIndex)(); -} - -namespace -{ - -#if defined(__QNX__) -using TimePoint = double; -#else -using TimePoint = std::chrono::time_point; -#endif - -TimePoint getCurrentTime() -{ -#if defined(__QNX__) - uint64_t const currentCycles = ClockCycles(); - uint64_t const cyclesPerSecond = SYSPAGE_ENTRY(qtime)->cycles_per_sec; - // Return current timestamp in ms. - return static_cast(currentCycles) * 1000. / cyclesPerSecond; -#else - return std::chrono::high_resolution_clock::now(); -#endif -} - -//! -//! \struct SyncStruct -//! \brief Threads synchronization structure -//! 
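Before buffers can be allocated, every dynamic (-1) input dimension must be pinned on the context, which is what the loop above automates. A minimal sketch for a single image input (the binding index and the 1x3x640x640 shape are illustrative assumptions):

    #include <NvInfer.h>

    // Pin a dynamic input shape before allocation (pre-TRT-10 API).
    bool fixInputShape(nvinfer1::IExecutionContext& context)
    {
        if (!context.setBindingDimensions(0, nvinfer1::Dims4{1, 3, 640, 640}))
            return false;
        // All inputs must be fully specified before enqueueV2() is legal.
        return context.allInputDimensionsSpecified();
    }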
-struct SyncStruct -{ - std::mutex mutex; - TrtCudaStream mainStream; - TrtCudaEvent gpuStart{cudaEventBlockingSync}; - TimePoint cpuStart{}; - float sleep{}; -}; - -struct Enqueue -{ - explicit Enqueue(nvinfer1::IExecutionContext& context, void** buffers) - : mContext(context) - , mBuffers(buffers) - { - } - - nvinfer1::IExecutionContext& mContext; - void** mBuffers{}; -}; - -//! -//! \class EnqueueImplicit -//! \brief Functor to enqueue inference with implict batch -//! -class EnqueueImplicit : private Enqueue -{ - -public: - explicit EnqueueImplicit(nvinfer1::IExecutionContext& context, void** buffers, int32_t batch) - : Enqueue(context, buffers) - , mBatch(batch) - { - } - - bool operator()(TrtCudaStream& stream) const - { - if (mContext.enqueue(mBatch, mBuffers, stream.get(), nullptr)) - { - // Collecting layer timing info from current profile index of execution context - if (mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() && !mContext.reportToProfiler()) - { - gLogWarning << "Failed to collect layer timing info from previous enqueue()" << std::endl; - } - return true; - } - return false; - } - -private: - int32_t mBatch; -}; - -//! -//! \class EnqueueExplicit -//! \brief Functor to enqueue inference with explict batch -//! -class EnqueueExplicit : private Enqueue -{ - -public: - explicit EnqueueExplicit(nvinfer1::IExecutionContext& context, void** buffers) - : Enqueue(context, buffers) - { - } - - bool operator()(TrtCudaStream& stream) const - { - if (mContext.enqueueV2(mBuffers, stream.get(), nullptr)) - { - // Collecting layer timing info from current profile index of execution context - if (mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() && !mContext.reportToProfiler()) - { - gLogWarning << "Failed to collect layer timing info from previous enqueueV2()" << std::endl; - } - return true; - } - return false; - } -}; - -//! -//! \class EnqueueGraph -//! \brief Functor to enqueue inference from CUDA Graph -//! -class EnqueueGraph -{ - -public: - explicit EnqueueGraph(nvinfer1::IExecutionContext& context, TrtCudaGraph& graph) - : mGraph(graph) - , mContext(context) - { - } - - bool operator()(TrtCudaStream& stream) const - { - if (mGraph.launch(stream)) - { - // Collecting layer timing info from current profile index of execution context - if (mContext.getProfiler() && !mContext.reportToProfiler()) - { - gLogWarning << "Failed to collect layer timing info from previous CUDA graph launch" << std::endl; - } - return true; - } - return false; - } - - TrtCudaGraph& mGraph; - nvinfer1::IExecutionContext& mContext; -}; - -//! -//! \class EnqueueSafe -//! \brief Functor to enqueue safe execution context -//! -class EnqueueSafe -{ -public: - explicit EnqueueSafe(nvinfer1::safe::IExecutionContext& context, void** buffers) - : mContext(context) - , mBuffers(buffers) - { - } - - bool operator()(TrtCudaStream& stream) const - { - if (mContext.enqueueV2(mBuffers, stream.get(), nullptr)) - { - return true; - } - return false; - } - - nvinfer1::safe::IExecutionContext& mContext; - void** mBuffers{}; -}; - -using EnqueueFunction = std::function; - -enum class StreamType : int32_t -{ - kINPUT = 0, - kCOMPUTE = 1, - kOUTPUT = 2, - kNUM = 3 -}; - -enum class EventType : int32_t -{ - kINPUT_S = 0, - kINPUT_E = 1, - kCOMPUTE_S = 2, - kCOMPUTE_E = 3, - kOUTPUT_S = 4, - kOUTPUT_E = 5, - kNUM = 6 -}; - -using MultiStream = std::array(StreamType::kNUM)>; - -using MultiEvent = std::array, static_cast(EventType::kNUM)>; - -using EnqueueTimes = std::array; - -//! -//! 
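The Iteration helper that follows brackets every phase (H2D copy, compute, D2H copy) with CUDA events so each stage can be timed without stalling the pipeline. The core of that pattern, condensed:

    #include <cuda_runtime_api.h>

    // Record events around async work on a stream, then read the elapsed time.
    // The enqueue itself (e.g. context.enqueueV2(...)) is elided.
    float timeAsyncWork(cudaStream_t stream)
    {
        cudaEvent_t start{};
        cudaEvent_t stop{};
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, stream);
        // ... enqueue kernels or an execution context here ...
        cudaEventRecord(stop, stream);
        cudaEventSynchronize(stop);
        float ms{0.F};
        cudaEventElapsedTime(&ms, start, stop);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        return ms;
    }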
\class Iteration -//! \brief Inference iteration and streams management -//! -template -class Iteration -{ - -public: - Iteration(int32_t id, const InferenceOptions& inference, ContextType& context, Bindings& bindings) - : mBindings(bindings) - , mStreamId(id) - , mDepth(1 + inference.overlap) - , mActive(mDepth) - , mEvents(mDepth) - , mEnqueueTimes(mDepth) - , mContext(&context) - { - for (int32_t d = 0; d < mDepth; ++d) - { - for (int32_t e = 0; e < static_cast(EventType::kNUM); ++e) - { - mEvents[d][e].reset(new TrtCudaEvent(!inference.spin)); - } - } - createEnqueueFunction(inference, context, bindings); - } - - bool query(bool skipTransfers) - { - if (mActive[mNext]) - { - return true; - } - - if (!skipTransfers) - { - record(EventType::kINPUT_S, StreamType::kINPUT); - mBindings.transferInputToDevice(getStream(StreamType::kINPUT)); - record(EventType::kINPUT_E, StreamType::kINPUT); - wait(EventType::kINPUT_E, StreamType::kCOMPUTE); // Wait for input DMA before compute - } - - record(EventType::kCOMPUTE_S, StreamType::kCOMPUTE); - recordEnqueueTime(); - if (!mEnqueue(getStream(StreamType::kCOMPUTE))) - { - return false; - } - recordEnqueueTime(); - record(EventType::kCOMPUTE_E, StreamType::kCOMPUTE); - - if (!skipTransfers) - { - wait(EventType::kCOMPUTE_E, StreamType::kOUTPUT); // Wait for compute before output DMA - record(EventType::kOUTPUT_S, StreamType::kOUTPUT); - mBindings.transferOutputToHost(getStream(StreamType::kOUTPUT)); - record(EventType::kOUTPUT_E, StreamType::kOUTPUT); - } - - mActive[mNext] = true; - moveNext(); - return true; - } - - float sync( - const TimePoint& cpuStart, const TrtCudaEvent& gpuStart, std::vector& trace, bool skipTransfers) - { - if (mActive[mNext]) - { - if (skipTransfers) - { - getEvent(EventType::kCOMPUTE_E).synchronize(); - } - else - { - getEvent(EventType::kOUTPUT_E).synchronize(); - } - trace.emplace_back(getTrace(cpuStart, gpuStart, skipTransfers)); - mActive[mNext] = false; - return getEvent(EventType::kCOMPUTE_S) - gpuStart; - } - return 0; - } - - void syncAll( - const TimePoint& cpuStart, const TrtCudaEvent& gpuStart, std::vector& trace, bool skipTransfers) - { - for (int32_t d = 0; d < mDepth; ++d) - { - sync(cpuStart, gpuStart, trace, skipTransfers); - moveNext(); - } - } - - void wait(TrtCudaEvent& gpuStart) - { - getStream(StreamType::kINPUT).wait(gpuStart); - } - - void setInputData() - { - mBindings.transferInputToDevice(getStream(StreamType::kINPUT)); - } - - void fetchOutputData() - { - mBindings.transferOutputToHost(getStream(StreamType::kOUTPUT)); - } - -private: - void moveNext() - { - mNext = mDepth - 1 - mNext; - } - - TrtCudaStream& getStream(StreamType t) - { - return mStream[static_cast(t)]; - } - - TrtCudaEvent& getEvent(EventType t) - { - return *mEvents[mNext][static_cast(t)]; - } - - void record(EventType e, StreamType s) - { - getEvent(e).record(getStream(s)); - } - - void recordEnqueueTime() - { - mEnqueueTimes[mNext][enqueueStart] = getCurrentTime(); - enqueueStart = 1 - enqueueStart; - } - - TimePoint getEnqueueTime(bool start) - { - return mEnqueueTimes[mNext][start ? 0 : 1]; - } - - void wait(EventType e, StreamType s) - { - getStream(s).wait(getEvent(e)); - } - - InferenceTrace getTrace(const TimePoint& cpuStart, const TrtCudaEvent& gpuStart, bool skipTransfers) - { - float is = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart : getEvent(EventType::kINPUT_S) - gpuStart; - float ie = skipTransfers ? 
getEvent(EventType::kCOMPUTE_S) - gpuStart : getEvent(EventType::kINPUT_E) - gpuStart; - float os = skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart : getEvent(EventType::kOUTPUT_S) - gpuStart; - float oe = skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart : getEvent(EventType::kOUTPUT_E) - gpuStart; - - return InferenceTrace(mStreamId, - std::chrono::duration(getEnqueueTime(true) - cpuStart).count(), - std::chrono::duration(getEnqueueTime(false) - cpuStart).count(), is, ie, - getEvent(EventType::kCOMPUTE_S) - gpuStart, getEvent(EventType::kCOMPUTE_E) - gpuStart, os, oe); - } - - void createEnqueueFunction(const InferenceOptions& inference, nvinfer1::IExecutionContext& context, Bindings& /*bindings*/) - { - if (inference.batch) - mEnqueue = EnqueueFunction(EnqueueImplicit(context, mBindings.getDeviceBuffers(), inference.batch)); - else - mEnqueue = EnqueueFunction(EnqueueExplicit(context, mBindings.getDeviceBuffers())); - - if (inference.graph) - { - TrtCudaStream& stream = getStream(StreamType::kCOMPUTE); - // Avoid capturing initialization calls by executing the enqueue function at least once before starting CUDA graph capture. - const auto ret = mEnqueue(stream); - assert(ret); - stream.synchronize(); - - mGraph.beginCapture(stream); - // The built TRT engine may contain operations that are not permitted under CUDA graph capture mode. - // When the stream is capturing, the enqueue call may return false if the current CUDA graph capture fails. - if (mEnqueue(stream)) - { - mGraph.endCapture(stream); - mEnqueue = EnqueueFunction(EnqueueGraph(context, mGraph)); - } - else - { - mGraph.endCaptureOnError(stream); - // Ensure any CUDA error has been cleaned up. - cudaCheck(cudaGetLastError()); - sample::gLogWarning << "The built TensorRT engine contains operations that are not permitted under " - "CUDA graph capture mode." - << std::endl; - sample::gLogWarning << "The specified --useCudaGraph flag has been ignored. The inference will be " - "launched without using CUDA graph launch." 
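The capture-or-fall-back logic above is the interesting part of the CUDA graph path: run the enqueue once outside capture so lazy initialization is not recorded, then capture, and fall back to plain enqueue when the engine contains capture-incompatible operations. A sketch, assuming CUDA 12's three-argument cudaGraphInstantiate(); `enqueue` stands in for the enqueueV2() call:

    #include <cuda_runtime_api.h>

    // Capture one enqueue into a graph, or report failure so the caller can
    // fall back to plain enqueue.
    template <typename EnqueueFn>
    bool captureOrFallback(EnqueueFn enqueue, cudaStream_t stream, cudaGraphExec_t& graphExec)
    {
        if (!enqueue(stream)) // Warm-up run outside capture.
            return false;
        cudaStreamSynchronize(stream);

        cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
        const bool enqueued = enqueue(stream);
        cudaGraph_t graph{};
        if (cudaStreamEndCapture(stream, &graph) != cudaSuccess || !enqueued)
        {
            if (graph != nullptr)
                cudaGraphDestroy(graph);
            cudaGetLastError(); // Clear any sticky capture error.
            return false;
        }
        const bool ok = cudaGraphInstantiate(&graphExec, graph, 0) == cudaSuccess;
        cudaGraphDestroy(graph);
        return ok;
    }

On success, subsequent iterations replay the work with cudaGraphLaunch(graphExec, stream) instead of re-enqueueing.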
- << std::endl; - } - } - } - - void createEnqueueFunction(const InferenceOptions&, nvinfer1::safe::IExecutionContext& context, Bindings&) - { - mEnqueue = EnqueueFunction(EnqueueSafe(context, mBindings.getDeviceBuffers())); - } - - Bindings& mBindings; - - TrtCudaGraph mGraph; - EnqueueFunction mEnqueue; - - int32_t mStreamId{0}; - int32_t mNext{0}; - int32_t mDepth{2}; // default to double buffer to hide DMA transfers - - std::vector mActive; - MultiStream mStream; - std::vector mEvents; - - int32_t enqueueStart{0}; - std::vector mEnqueueTimes; - ContextType* mContext{nullptr}; -}; - -template -bool inferenceLoop(std::vector>>& iStreams, const TimePoint& cpuStart, - const TrtCudaEvent& gpuStart, int iterations, float maxDurationMs, float warmupMs, - std::vector& trace, bool skipTransfers, float idleMs) -{ - float durationMs = 0; - int32_t skip = 0; - - for (int32_t i = 0; i < iterations + skip || durationMs < maxDurationMs; ++i) - { - for (auto& s : iStreams) - { - if (!s->query(skipTransfers)) - return false; - } - for (auto& s : iStreams) - { - durationMs = std::max(durationMs, s->sync(cpuStart, gpuStart, trace, skipTransfers)); - } - if (durationMs < warmupMs) // Warming up - { - if (durationMs) // Skip complete iterations - ++skip; - - continue; - } - if (idleMs != 0.F) - std::this_thread::sleep_for(std::chrono::duration(idleMs)); - } - for (auto& s : iStreams) - { - s->syncAll(cpuStart, gpuStart, trace, skipTransfers); - } - return true; -} - -template -void inferenceExecution(const InferenceOptions& inference, InferenceEnvironment& iEnv, SyncStruct& sync, - const int32_t threadIdx, const int32_t streamsPerThread, int32_t device, std::vector& trace) -{ - float warmupMs = inference.warmup; - float durationMs = inference.duration * 1000.F + warmupMs; - - cudaCheck(cudaSetDevice(device)); - - std::vector>> iStreams; - - for (int32_t s = 0; s < streamsPerThread; ++s) - { - const int32_t streamId{threadIdx * streamsPerThread + s}; - auto* iteration = new Iteration( - streamId, inference, *iEnv.template getContext(streamId), *iEnv.bindings[streamId]); - if (inference.skipTransfers) - { - iteration->setInputData(); - } - iStreams.emplace_back(iteration); - } - - for (auto& s : iStreams) - { - s->wait(sync.gpuStart); - } - - std::vector localTrace; - if (!inferenceLoop(iStreams, sync.cpuStart, sync.gpuStart, inference.iterations, durationMs, warmupMs, localTrace, - inference.skipTransfers, inference.idle)) - { - iEnv.error = true; - } - - if (inference.skipTransfers) - { - for (auto& s : iStreams) - { - s->fetchOutputData(); - } - } - - sync.mutex.lock(); - trace.insert(trace.end(), localTrace.begin(), localTrace.end()); - sync.mutex.unlock(); -} - -inline std::thread makeThread(const InferenceOptions& inference, InferenceEnvironment& iEnv, SyncStruct& sync, - int32_t threadIdx, int32_t streamsPerThread, int32_t device, std::vector& trace) -{ - - if (iEnv.safe) - { - ASSERT(sample::hasSafeRuntime()); - return std::thread(inferenceExecution, std::cref(inference), std::ref(iEnv), - std::ref(sync), threadIdx, streamsPerThread, device, std::ref(trace)); - } - - return std::thread(inferenceExecution, std::cref(inference), std::ref(iEnv), - std::ref(sync), threadIdx, streamsPerThread, device, std::ref(trace)); -} - -} // namespace - -bool runInference( - const InferenceOptions& inference, InferenceEnvironment& iEnv, int32_t device, std::vector& trace) -{ - cudaCheck(cudaProfilerStart()); - - trace.resize(0); - - SyncStruct sync; - sync.sleep = inference.sleep; - 
sync.mainStream.sleep(&sync.sleep); - sync.cpuStart = getCurrentTime(); - sync.gpuStart.record(sync.mainStream); - - // When multiple streams are used, trtexec can run inference in two modes: - // (1) if inference.threads is true, then run each stream on each thread. - // (2) if inference.threads is false, then run all streams on the same thread. - const int32_t numThreads = inference.threads ? inference.streams : 1; - const int32_t streamsPerThread = inference.threads ? 1 : inference.streams; - - std::vector threads; - for (int32_t threadIdx = 0; threadIdx < numThreads; ++threadIdx) - { - threads.emplace_back(makeThread(inference, iEnv, sync, threadIdx, streamsPerThread, device, trace)); - } - for (auto& th : threads) - { - th.join(); - } - - cudaCheck(cudaProfilerStop()); - - auto cmpTrace = [](const InferenceTrace& a, const InferenceTrace& b) { return a.h2dStart < b.h2dStart; }; - std::sort(trace.begin(), trace.end(), cmpTrace); - - return !iEnv.error; -} - -namespace -{ -size_t reportGpuMemory() -{ - static size_t prevFree{0}; - size_t free{0}; - size_t total{0}; - size_t newlyAllocated{0}; - cudaCheck(cudaMemGetInfo(&free, &total)); - sample::gLogInfo << "Free GPU memory = " << free / 1024.0_MiB << " GiB"; - if (prevFree != 0) - { - newlyAllocated = (prevFree - free); - sample::gLogInfo << ", newly allocated GPU memory = " << newlyAllocated / 1024.0_MiB << " GiB"; - } - sample::gLogInfo << ", total GPU memory = " << total / 1024.0_MiB << " GiB" << std::endl; - prevFree = free; - return newlyAllocated; -} -} // namespace - -//! Returns true if deserialization is slower than expected or fails. -bool timeDeserialize(InferenceEnvironment& iEnv) -{ - constexpr int32_t kNB_ITERS{20}; - std::unique_ptr rt{nvinfer1::createInferRuntime(sample::gLogger.getTRTLogger())}; - std::unique_ptr engine; - - std::unique_ptr safeRT{sample::createSafeInferRuntime(sample::gLogger.getTRTLogger())}; - std::unique_ptr safeEngine; - - if (iEnv.safe) - { - ASSERT(sample::hasSafeRuntime() && safeRT != nullptr); - safeRT->setErrorRecorder(&gRecorder); - } - - auto timeDeserializeFn = [&]() -> float { - bool deserializeOK{false}; - engine.reset(nullptr); - safeEngine.reset(nullptr); - auto startClock = std::chrono::high_resolution_clock::now(); - if (iEnv.safe) - { - safeEngine.reset(safeRT->deserializeCudaEngine(iEnv.engineBlob.data(), iEnv.engineBlob.size())); - deserializeOK = (safeEngine != nullptr); - } - else - { - engine.reset(rt->deserializeCudaEngine(iEnv.engineBlob.data(), iEnv.engineBlob.size())); - deserializeOK = (engine != nullptr); - } - auto endClock = std::chrono::high_resolution_clock::now(); - // return NAN if deserialization failed. - return deserializeOK ? std::chrono::duration(endClock - startClock).count() : NAN; - }; - - // Warmup the caches to make sure that cache thrashing isn't throwing off the results - { - sample::gLogInfo << "Begin deserialization warmup..." << std::endl; - for (int32_t i = 0, e = 2; i < e; ++i) - { - timeDeserializeFn(); - } - } - sample::gLogInfo << "Begin deserialization engine timing..." << std::endl; - float const first = timeDeserializeFn(); - - // Check if first deserialization suceeded. - if (std::isnan(first)) - { - sample::gLogError << "Engine deserialization failed." << std::endl; - return true; - } - - sample::gLogInfo << "First deserialization time = " << first << " milliseconds" << std::endl; - - // Record initial gpu memory state. 
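reportGpuMemory() above infers allocation sizes from the drop in free device memory between calls; the same trick in isolation:

    #include <cuda_runtime_api.h>
    #include <cstdio>

    // Approximate what a step allocated from the change in free device memory.
    int main()
    {
        size_t freeBefore{0}, freeAfter{0}, total{0};
        cudaMemGetInfo(&freeBefore, &total);
        // ... deserialize an engine or allocate device buffers here ...
        cudaMemGetInfo(&freeAfter, &total);
        std::printf("newly allocated: %.1f MiB\n", (freeBefore - freeAfter) / (1024.0 * 1024.0));
        return 0;
    }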
- reportGpuMemory(); - - float totalTime{0.F}; - for (int32_t i = 0; i < kNB_ITERS; ++i) - { - totalTime += timeDeserializeFn(); - } - const auto averageTime = totalTime / kNB_ITERS; - // reportGpuMemory sometimes reports zero after a single deserialization of a small engine, - // so use the size of memory for all the iterations. - const auto totalEngineSizeGpu = reportGpuMemory(); - sample::gLogInfo << "Total deserialization time = " << totalTime << " milliseconds in " << kNB_ITERS - << " iterations, average time = " << averageTime << " milliseconds, first time = " << first - << " milliseconds." << std::endl; - sample::gLogInfo << "Deserialization Bandwidth = " << 1E-6 * totalEngineSizeGpu / totalTime << " GB/s" << std::endl; - - // If the first deserialization is more than tolerance slower than - // the average deserialization, return true, which means an error occurred. - // The tolerance is set to 2x since the deserialization time is quick and susceptible - // to caching issues causing problems in the first timing. - const auto tolerance = 2.0F; - const bool isSlowerThanExpected = first > averageTime * tolerance; - if (isSlowerThanExpected) - { - sample::gLogInfo << "First deserialization time divided by average time is " << (first / averageTime) - << ". Exceeds tolerance of " << tolerance << "x." << std::endl; - } - return isSlowerThanExpected; -} - -std::string getLayerInformation(const InferenceEnvironment& iEnv, nvinfer1::LayerInformationFormat format) -{ - auto runtime = std::unique_ptr(nvinfer1::createInferRuntime(sample::gLogger.getTRTLogger())); - auto inspector = std::unique_ptr(iEnv.engine->createEngineInspector()); - if (!iEnv.context.empty()) - { - inspector->setExecutionContext(iEnv.context.front().get()); - } - std::string result = inspector->getEngineInformation(format); - return result; -} - -} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleInference.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleInference.h deleted file mode 100644 index 1c21f592..00000000 --- a/src/Detector/tensorrt_yolo/common_deprecated/sampleInference.h +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef TRT_SAMPLE_INFERENCE_H -#define TRT_SAMPLE_INFERENCE_H - -#include "sampleReporting.h" -#include "sampleUtils.h" - -#include -#include -#include -#include - -#include "NvInfer.h" - -#if (NV_TENSORRT_MAJOR > 7) - -#include "NvInferSafeRuntime.h" - -namespace sample -{ - -struct InferenceEnvironment -{ - TrtUniquePtr engine; - std::unique_ptr profiler; - std::vector> context; - std::vector> bindings; - bool error{false}; - - std::vector engineBlob; - - bool safe{false}; - std::unique_ptr safeEngine; - std::vector> safeContext; - - template - inline ContextType* getContext(int32_t streamIdx); -}; - -template <> -inline nvinfer1::IExecutionContext* InferenceEnvironment::getContext(int32_t streamIdx) -{ - return context[streamIdx].get(); -} - -template <> -inline nvinfer1::safe::IExecutionContext* InferenceEnvironment::getContext(int32_t streamIdx) -{ - return safeContext[streamIdx].get(); -} - -//! -//! \brief Set up contexts and bindings for inference -//! -bool setUpInference(InferenceEnvironment& iEnv, const InferenceOptions& inference); - -//! -//! \brief Deserialize the engine and time how long it takes. -//! -bool timeDeserialize(InferenceEnvironment& iEnv); - -//! -//! \brief Run inference and collect timing, return false if any error hit during inference -//! -bool runInference( - const InferenceOptions& inference, InferenceEnvironment& iEnv, int32_t device, std::vector& trace); - -//! -//! \brief Get layer information of the engine. -//! -std::string getLayerInformation(const InferenceEnvironment& iEnv, nvinfer1::LayerInformationFormat format); - -} // namespace sample - -#endif - -#endif // TRT_SAMPLE_INFERENCE_H diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.cpp b/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.cpp deleted file mode 100644 index 0afd163f..00000000 --- a/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.cpp +++ /dev/null @@ -1,1778 +0,0 @@ -/* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "NvInfer.h" - -#include "logger.h" -#include "sampleOptions.h" - -namespace sample -{ - -namespace -{ - -std::vector splitToStringVec(const std::string& option, char separator) -{ - std::vector options; - - for (size_t start = 0; start < option.length();) - { - size_t separatorIndex = option.find(separator, start); - if (separatorIndex == std::string::npos) - { - separatorIndex = option.length(); - } - options.emplace_back(option.substr(start, separatorIndex - start)); - start = separatorIndex + 1; - } - - return options; -} - -template -T stringToValue(const std::string& option) -{ - return T{option}; -} - -template <> -int32_t stringToValue(const std::string& option) -{ - return std::stoi(option); -} - -template <> -float stringToValue(const std::string& option) -{ - return std::stof(option); -} - -template <> -double stringToValue(const std::string& option) -{ - return std::stod(option); -} - -template <> -bool stringToValue(const std::string& option) -{ - return true; -} - -template <> -std::vector stringToValue>(const std::string& option) -{ - std::vector shape; - std::vector dimsStrings = splitToStringVec(option, 'x'); - for (const auto& d : dimsStrings) - { - shape.push_back(stringToValue(d)); - } - return shape; -} - -template <> -nvinfer1::DataType stringToValue(const std::string& option) -{ - const std::unordered_map strToDT{{"fp32", nvinfer1::DataType::kFLOAT}, - {"fp16", nvinfer1::DataType::kHALF}, {"int8", nvinfer1::DataType::kINT8}, - {"int32", nvinfer1::DataType::kINT32}}; - const auto& dt = strToDT.find(option); - if (dt == strToDT.end()) - { - throw std::invalid_argument("Invalid DataType " + option); - } - return dt->second; -} - -template <> -nvinfer1::TensorFormats stringToValue(const std::string& option) -{ - std::vector optionStrings = splitToStringVec(option, '+'); - const std::unordered_map strToFmt{{"chw", nvinfer1::TensorFormat::kLINEAR}, - {"chw2", nvinfer1::TensorFormat::kCHW2}, {"chw4", nvinfer1::TensorFormat::kCHW4}, - {"hwc8", nvinfer1::TensorFormat::kHWC8}, {"chw16", nvinfer1::TensorFormat::kCHW16}, - {"chw32", nvinfer1::TensorFormat::kCHW32}, {"dhwc8", nvinfer1::TensorFormat::kDHWC8}, - {"hwc", nvinfer1::TensorFormat::kHWC}, {"dla_linear", nvinfer1::TensorFormat::kDLA_LINEAR}, - {"dla_hwc4", nvinfer1::TensorFormat::kDLA_HWC4}}; - nvinfer1::TensorFormats formats{}; - for (auto f : optionStrings) - { - const auto& tf = strToFmt.find(f); - if (tf == strToFmt.end()) - { - throw std::invalid_argument(std::string("Invalid TensorFormat ") + f); - } - formats |= 1U << static_cast(tf->second); - } - - return formats; -} - -template <> -IOFormat stringToValue(const std::string& option) -{ - IOFormat ioFormat{}; - const size_t colon = option.find(':'); - - if (colon == std::string::npos) - { - throw std::invalid_argument(std::string("Invalid IOFormat ") + option); - } - - ioFormat.first = stringToValue(option.substr(0, colon)); - ioFormat.second = stringToValue(option.substr(colon + 1)); - - return ioFormat; -} - -template -std::pair splitNameAndValue(const std::string& s) -{ - std::string tensorName; - std::string valueString; - // Split on the last : - std::vector nameRange{splitToStringVec(s, ':')}; - // Everything before the last : is the name - tensorName = nameRange[0]; - for (size_t i = 1; i < nameRange.size() - 1; i++) - { - tensorName += ":" + nameRange[i]; - } - // Value is the string element after the last : - valueString = nameRange[nameRange.size() - 1]; - 
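splitToStringVec() and the stringToValue() specializations above are the whole parsing toolkit: every option value is a string split on a separator and converted element-wise. For example, a shape spec such as "input:1x3x224x224" splits on the last ':' into a tensor name and an 'x'-separated dimension list:

    #include <iostream>
    #include <string>
    #include <vector>

    // Same splitting idea as splitToStringVec(), applied to a shape spec.
    std::vector<std::string> split(const std::string& s, char sep)
    {
        std::vector<std::string> out;
        for (size_t start = 0; start < s.length();)
        {
            size_t pos = s.find(sep, start);
            if (pos == std::string::npos)
                pos = s.length();
            out.emplace_back(s.substr(start, pos - start));
            start = pos + 1;
        }
        return out;
    }

    int main()
    {
        const std::string spec{"input:1x3x224x224"};
        const size_t colon = spec.rfind(':'); // Split on the last ':'.
        std::vector<int> dims;
        for (const auto& d : split(spec.substr(colon + 1), 'x'))
            dims.push_back(std::stoi(d));
        std::cout << spec.substr(0, colon) << " has " << dims.size() << " dims" << std::endl;
        return 0;
    }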
return std::pair(tensorName, stringToValue(valueString)); -} - -template -void splitInsertKeyValue(const std::vector& kvList, T& map) -{ - for (const auto& kv : kvList) - { - map.insert(splitNameAndValue(kv)); - } -} - -const char* boolToEnabled(bool enable) -{ - return enable ? "Enabled" : "Disabled"; -} - -//! Check if input option exists in input arguments. -//! If it does: return its value, erase the argument and return true. -//! If it does not: return false. -template -bool getAndDelOption(Arguments& arguments, const std::string& option, T& value) -{ - const auto match = arguments.find(option); - if (match != arguments.end()) - { - value = stringToValue(match->second); - arguments.erase(match); - return true; - } - - return false; -} - -//! Check if input option exists in input arguments. -//! If it does: return false in value, erase the argument and return true. -//! If it does not: return false. -bool getAndDelNegOption(Arguments& arguments, const std::string& option, bool& value) -{ - bool dummy; - if (getAndDelOption(arguments, option, dummy)) - { - value = false; - return true; - } - return false; -} - -//! Check if input option exists in input arguments. -//! If it does: add all the matched arg values to values vector, erase the argument and return true. -//! If it does not: return false. -template -bool getAndDelRepeatedOption(Arguments& arguments, const std::string& option, std::vector& values) -{ - const auto match = arguments.equal_range(option); - if (match.first == match.second) - { - return false; - } - - auto addToValues = [&values](Arguments::value_type& argValue) {values.emplace_back(stringToValue(argValue.second));}; - std::for_each(match.first, match.second, addToValues); - arguments.erase(match.first, match.second); - - return true; -} - -void insertShapesBuild(std::unordered_map& shapes, nvinfer1::OptProfileSelector selector, const std::string& name, const std::vector& dims) -{ - shapes[name][static_cast(selector)] = dims; -} - -void insertShapesInference(std::unordered_map>& shapes, const std::string& name, const std::vector& dims) -{ - shapes[name] = dims; -} - -std::string removeSingleQuotationMarks(std::string& str) -{ - std::vector strList{splitToStringVec(str, '\'')}; - // Remove all the escaped single quotation marks - std::string retVal = ""; - // Do not really care about unterminated sequences - for (size_t i = 0; i < strList.size(); i++) - { - retVal += strList[i]; - } - return retVal; -} - -void getLayerPrecisions(Arguments& arguments, char const* argument, LayerPrecisions& layerPrecisions) -{ - std::string list; - if (!getAndDelOption(arguments, argument, list)) - { - return; - } - - // The layerPrecisions flag contains comma-separated layerName:precision pairs. - std::vector precisionList{splitToStringVec(list, ',')}; - for (auto const& s : precisionList) - { - auto namePrecisionPair = splitNameAndValue(s); - auto const layerName = removeSingleQuotationMarks(namePrecisionPair.first); - layerPrecisions[layerName] = namePrecisionPair.second; - } -} - -void getLayerOutputTypes(Arguments& arguments, char const* argument, LayerOutputTypes& layerOutputTypes) -{ - std::string list; - if (!getAndDelOption(arguments, argument, list)) - { - return; - } - - // The layerOutputTypes flag contains comma-separated layerName:types pairs. 
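getAndDelOption() and its variants above all follow a consume-and-erase discipline: recognized flags are removed from the multimap so whatever remains can later be reported as unknown. The core lookup, reduced to its essentials:

    #include <string>
    #include <unordered_map>

    // Consume-and-erase option lookup over a multimap of raw arguments.
    using Arguments = std::unordered_multimap<std::string, std::string>;

    bool takeOption(Arguments& args, const std::string& option, std::string& value)
    {
        const auto match = args.find(option);
        if (match == args.end())
            return false;
        value = match->second;
        args.erase(match); // Leftover entries are later reported as unknown flags.
        return true;
    }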
- std::vector precisionList{splitToStringVec(list, ',')}; - for (auto const& s : precisionList) - { - auto namePrecisionPair = splitNameAndValue(s); - auto const layerName = removeSingleQuotationMarks(namePrecisionPair.first); - auto const typeStrings = splitToStringVec(namePrecisionPair.second, '+'); - std::vector typeVec(typeStrings.size(), nvinfer1::DataType::kFLOAT); - std::transform(typeStrings.begin(), typeStrings.end(), typeVec.begin(), stringToValue); - layerOutputTypes[layerName] = typeVec; - } -} - -bool getShapesBuild(Arguments& arguments, std::unordered_map& shapes, char const* argument, - nvinfer1::OptProfileSelector selector) -{ - std::string list; - bool retVal = getAndDelOption(arguments, argument, list); - std::vector shapeList{splitToStringVec(list, ',')}; - for (const auto& s : shapeList) - { - auto nameDimsPair = splitNameAndValue>(s); - auto tensorName = removeSingleQuotationMarks(nameDimsPair.first); - auto dims = nameDimsPair.second; - insertShapesBuild(shapes, selector, tensorName, dims); - } - return retVal; -} - -bool getShapesInference(Arguments& arguments, std::unordered_map>& shapes, const char* argument) -{ - std::string list; - bool retVal = getAndDelOption(arguments, argument, list); - std::vector shapeList{splitToStringVec(list, ',')}; - for (const auto& s : shapeList) - { - auto nameDimsPair = splitNameAndValue>(s); - auto tensorName = removeSingleQuotationMarks(nameDimsPair.first); - auto dims = nameDimsPair.second; - insertShapesInference(shapes, tensorName, dims); - } - return retVal; -} - -void processShapes(std::unordered_map& shapes, bool minShapes, bool optShapes, bool maxShapes, bool calib) -{ - // Only accept optShapes only or all three of minShapes, optShapes, maxShapes - if ( ((minShapes || maxShapes) && !optShapes) // minShapes only, maxShapes only, both minShapes and maxShapes - || (minShapes && !maxShapes && optShapes) // both minShapes and optShapes - || (!minShapes && maxShapes && optShapes)) // both maxShapes and optShapes - { - if (calib) - { - throw std::invalid_argument("Must specify only --optShapesCalib or all of --minShapesCalib, --optShapesCalib, --maxShapesCalib"); - } - else - { - throw std::invalid_argument("Must specify only --optShapes or all of --minShapes, --optShapes, --maxShapes"); - } - } - - // If optShapes only, expand optShapes to minShapes and maxShapes - if (optShapes && !minShapes && !maxShapes) - { - std::unordered_map newShapes; - for (auto& s : shapes) - { - insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kMIN, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); - insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kOPT, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); - insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kMAX, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); - } - shapes = newShapes; - } -} - -template -void printShapes(std::ostream& os, const char* phase, const T& shapes) -{ - if (shapes.empty()) - { - os << "Input " << phase << " shapes: model" << std::endl; - } - else - { - for (const auto& s : shapes) - { - os << "Input " << phase << " shape: " << s.first << "=" << s.second << std::endl; - } - } -} - -std::ostream& printBatch(std::ostream& os, int32_t maxBatch) -{ - if (maxBatch != maxBatchNotProvided) - { - os << maxBatch; - } - else - { - os << "explicit batch"; - } - return os; -} - -std::ostream& printTacticSources(std::ostream& os, nvinfer1::TacticSources enabledSources, nvinfer1::TacticSources 
disabledSources) -{ - if (!enabledSources && !disabledSources) - { - os << "Using default tactic sources"; - } - else - { - auto const addSource = [&](uint32_t source, std::string const& name) { - if (enabledSources & source) - { - os << name << " [ON], "; - } - else if (disabledSources & source) - { - os << name << " [OFF], "; - } - }; - - addSource(1U << static_cast(nvinfer1::TacticSource::kCUBLAS), "cublas"); - addSource(1U << static_cast(nvinfer1::TacticSource::kCUBLAS_LT), "cublasLt"); -#if (NV_TENSORRT_MAJOR > 7) - addSource(1U << static_cast(nvinfer1::TacticSource::kCUDNN), "cudnn"); -#endif - } - return os; -} - -std::ostream& printPrecision(std::ostream& os, BuildOptions const& options) -{ - os << "FP32"; - if (options.fp16) - { - os << "+FP16"; - } - if (options.int8) - { - os << "+INT8"; - } - if (options.precisionConstraints == PrecisionConstraints::kOBEY) - { - os << " (obey precision constraints)"; - } - if (options.precisionConstraints == PrecisionConstraints::kPREFER) - { - os << " (prefer precision constraints)"; - } - return os; -} - -std::ostream& printTimingCache(std::ostream& os, BuildOptions const& options) -{ - switch (options.timingCacheMode) - { - case TimingCacheMode::kGLOBAL: os << "global"; break; - case TimingCacheMode::kLOCAL: os << "local"; break; - case TimingCacheMode::kDISABLE: os << "disable"; break; - } - return os; -} - -std::ostream& printSparsity(std::ostream& os, BuildOptions const& options) -{ - switch (options.sparsity) - { - case SparsityFlag::kDISABLE: os << "Disabled"; break; - case SparsityFlag::kENABLE: os << "Enabled"; break; - case SparsityFlag::kFORCE: os << "Forced"; break; - } - - return os; -} - -std::ostream& printMemoryPools(std::ostream& os, BuildOptions const& options) -{ - auto const printValueOrDefault = [&os](double const val) { - if (val >= 0) - { - os << val << " MiB"; - } - else - { - os << "default"; - } - }; - os << "workspace: "; printValueOrDefault(options.workspace); os << ", "; - os << "dlaSRAM: "; printValueOrDefault(options.dlaSRAM); os << ", "; - os << "dlaLocalDRAM: "; printValueOrDefault(options.dlaLocalDRAM); os << ", "; - os << "dlaGlobalDRAM: "; printValueOrDefault(options.dlaGlobalDRAM); - return os; -} - -} // namespace - -Arguments argsToArgumentsMap(int32_t argc, char* argv[]) -{ - Arguments arguments; - for (int32_t i = 1; i < argc; ++i) - { - auto valuePtr = strchr(argv[i], '='); - if (valuePtr) - { - std::string value{valuePtr + 1}; - arguments.emplace(std::string(argv[i], valuePtr - argv[i]), value); - } - else - { - arguments.emplace(argv[i], ""); - } - } - return arguments; -} - -void BaseModelOptions::parse(Arguments& arguments) -{ - if (getAndDelOption(arguments, "--onnx", model)) - { - format = ModelFormat::kONNX; - } - else if (getAndDelOption(arguments, "--uff", model)) - { - format = ModelFormat::kUFF; - } - else if (getAndDelOption(arguments, "--model", model)) - { - format = ModelFormat::kCAFFE; - } -} - -void UffInput::parse(Arguments& arguments) -{ - getAndDelOption(arguments, "--uffNHWC", NHWC); - std::vector args; - if (getAndDelRepeatedOption(arguments, "--uffInput", args)) - { - for (const auto& i : args) - { - std::vector values{splitToStringVec(i, ',')}; - if (values.size() == 4) - { - nvinfer1::Dims3 dims{std::stoi(values[1]), std::stoi(values[2]), std::stoi(values[3])}; - inputs.emplace_back(values[0], dims); - } - else - { - throw std::invalid_argument(std::string("Invalid uffInput ") + i); - } - } - } -} - -void ModelOptions::parse(Arguments& arguments) -{ - 
baseModel.parse(arguments); - - switch (baseModel.format) - { - case ModelFormat::kCAFFE: - { - getAndDelOption(arguments, "--deploy", prototxt); - break; - } - case ModelFormat::kUFF: - { - uffInputs.parse(arguments); - if (uffInputs.inputs.empty()) - { - throw std::invalid_argument("Uff models require at least one input"); - } - break; - } - case ModelFormat::kONNX: - break; - case ModelFormat::kANY: - { - if (getAndDelOption(arguments, "--deploy", prototxt)) - { - baseModel.format = ModelFormat::kCAFFE; - } - break; - } - } - - // The --output flag should only be used with Caffe and UFF. It has no effect on ONNX. - std::vector outArgs; - if (getAndDelRepeatedOption(arguments, "--output", outArgs)) - { - for (const auto& o : outArgs) - { - for (auto& v : splitToStringVec(o, ',')) - { - outputs.emplace_back(std::move(v)); - } - } - } - if (baseModel.format == ModelFormat::kCAFFE || baseModel.format == ModelFormat::kUFF) - { - if (outputs.empty()) - { - throw std::invalid_argument("Caffe and Uff models require at least one output"); - } - } - else if (baseModel.format == ModelFormat::kONNX) - { - if (!outputs.empty()) - { - throw std::invalid_argument("The --output flag should not be used with ONNX models."); - } - } -} - -void BuildOptions::parse(Arguments& arguments) -{ - auto getFormats = [&arguments](std::vector& formatsVector, const char* argument) { - std::string list; - getAndDelOption(arguments, argument, list); - std::vector formats{splitToStringVec(list, ',')}; - for (const auto& f : formats) - { - formatsVector.push_back(stringToValue(f)); - } - }; - - getFormats(inputFormats, "--inputIOFormats"); - getFormats(outputFormats, "--outputIOFormats"); - - bool addedExplicitBatchFlag{false}; - getAndDelOption(arguments, "--explicitBatch", addedExplicitBatchFlag); - if (addedExplicitBatchFlag) - { - sample::gLogWarning << "--explicitBatch flag has been deprecated and has no effect!" << std::endl; - sample::gLogWarning << "Explicit batch dim is automatically enabled if input model is ONNX or if dynamic " - << "shapes are provided when the engine is built." << std::endl; - } - - bool minShapes = getShapesBuild(arguments, shapes, "--minShapes", nvinfer1::OptProfileSelector::kMIN); - bool optShapes = getShapesBuild(arguments, shapes, "--optShapes", nvinfer1::OptProfileSelector::kOPT); - bool maxShapes = getShapesBuild(arguments, shapes, "--maxShapes", nvinfer1::OptProfileSelector::kMAX); - processShapes(shapes, minShapes, optShapes, maxShapes, false); - bool minShapesCalib - = getShapesBuild(arguments, shapesCalib, "--minShapesCalib", nvinfer1::OptProfileSelector::kMIN); - bool optShapesCalib - = getShapesBuild(arguments, shapesCalib, "--optShapesCalib", nvinfer1::OptProfileSelector::kOPT); - bool maxShapesCalib - = getShapesBuild(arguments, shapesCalib, "--maxShapesCalib", nvinfer1::OptProfileSelector::kMAX); - processShapes(shapesCalib, minShapesCalib, optShapesCalib, maxShapesCalib, true); - - bool addedExplicitPrecisionFlag{false}; - getAndDelOption(arguments, "--explicitPrecision", addedExplicitPrecisionFlag); - if (addedExplicitPrecisionFlag) - { - sample::gLogWarning << "--explicitPrecision flag has been deprecated and has no effect!" << std::endl; - } - - if (getAndDelOption(arguments, "--workspace", workspace)) - { - sample::gLogWarning << "--workspace flag has been deprecated by --memPoolSize flag." 
<< std::endl; - } - - std::string memPoolSizes; - getAndDelOption(arguments, "--memPoolSize", memPoolSizes); - std::vector memPoolSpecs{splitToStringVec(memPoolSizes, ',')}; - for (auto const& memPoolSpec : memPoolSpecs) - { - std::string memPoolName; - double memPoolSize; - std::tie(memPoolName, memPoolSize) = splitNameAndValue(memPoolSpec); - if (memPoolSize < 0) - { - throw std::invalid_argument(std::string("Negative memory pool size: ") + std::to_string(memPoolSize)); - } - if (memPoolName == "workspace") - { - workspace = memPoolSize; - } - else if (memPoolName == "dlaSRAM") - { - dlaSRAM = memPoolSize; - } - else if (memPoolName == "dlaLocalDRAM") - { - dlaLocalDRAM = memPoolSize; - } - else if (memPoolName == "dlaGlobalDRAM") - { - dlaGlobalDRAM = memPoolSize; - } - else if (!memPoolName.empty()) - { - throw std::invalid_argument(std::string("Unknown memory pool: ") + memPoolName); - } - } - - getAndDelOption(arguments, "--maxBatch", maxBatch); - getAndDelOption(arguments, "--minTiming", minTiming); - getAndDelOption(arguments, "--avgTiming", avgTiming); - - bool best{false}; - getAndDelOption(arguments, "--best", best); - if (best) - { - int8 = true; - fp16 = true; - } - - getAndDelOption(arguments, "--refit", refittable); - getAndDelNegOption(arguments, "--noTF32", tf32); - getAndDelOption(arguments, "--fp16", fp16); - getAndDelOption(arguments, "--int8", int8); - getAndDelOption(arguments, "--safe", safe); - getAndDelOption(arguments, "--consistency", consistency); - getAndDelOption(arguments, "--restricted", restricted); - - getAndDelOption(arguments, "--directIO", directIO); - - std::string precisionConstraintsString; - getAndDelOption(arguments, "--precisionConstraints", precisionConstraintsString); - if (!precisionConstraintsString.empty()) - { - const std::unordered_map precisionConstraintsMap - = {{"obey", PrecisionConstraints::kOBEY}, {"prefer", PrecisionConstraints::kPREFER}, - {"none", PrecisionConstraints::kNONE}}; - auto it = precisionConstraintsMap.find(precisionConstraintsString); - if (it == precisionConstraintsMap.end()) - { - throw std::invalid_argument(std::string("Unknown precision constraints: ") + precisionConstraintsString); - } - precisionConstraints = it->second; - } - else - { - precisionConstraints = PrecisionConstraints::kNONE; - } - - getLayerPrecisions(arguments, "--layerPrecisions", layerPrecisions); - getLayerOutputTypes(arguments, "--layerOutputTypes", layerOutputTypes); - - if (layerPrecisions.empty() && layerOutputTypes.empty() && precisionConstraints != PrecisionConstraints::kNONE) - { - sample::gLogWarning << "When --precisionConstraints flag is set to \"obey\" or \"prefer\", please add " - << "--layerPrecision/--layerOutputTypes flags to set layer-wise precisions and output " - << "types." << std::endl; - } - else if ((!layerPrecisions.empty() || !layerOutputTypes.empty()) - && precisionConstraints == PrecisionConstraints::kNONE) - { - sample::gLogWarning << "--layerPrecision/--layerOutputTypes flags have no effect when --precisionConstraints " - << "flag is set to \"none\"." 
<< std::endl; - } - - std::string sparsityString; - getAndDelOption(arguments, "--sparsity", sparsityString); - if (sparsityString == "disable") - { - sparsity = SparsityFlag::kDISABLE; - } - else if (sparsityString == "enable") - { - sparsity = SparsityFlag::kENABLE; - } - else if (sparsityString == "force") - { - sparsity = SparsityFlag::kFORCE; - } - else if (!sparsityString.empty()) - { - throw std::invalid_argument(std::string("Unknown sparsity mode: ") + sparsityString); - } - - bool calibCheck = getAndDelOption(arguments, "--calib", calibration); - if (int8 && calibCheck && !shapes.empty() && shapesCalib.empty()) - { - shapesCalib = shapes; - } - - std::string profilingVerbosityString; - if (getAndDelOption(arguments, "--nvtxMode", profilingVerbosityString)) - { - sample::gLogWarning << "--nvtxMode flag has been deprecated by --profilingVerbosity flag." << std::endl; - } - - getAndDelOption(arguments, "--profilingVerbosity", profilingVerbosityString); - if (profilingVerbosityString == "layer_names_only") - { -#if (NV_TENSORRT_MAJOR > 7) - profilingVerbosity = nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY; -#else - profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; -#endif - } - else if (profilingVerbosityString == "none") - { - profilingVerbosity = nvinfer1::ProfilingVerbosity::kNONE; - } -#if (NV_TENSORRT_MAJOR > 7) - else if (profilingVerbosityString == "detailed") - { - profilingVerbosity = nvinfer1::ProfilingVerbosity::kDETAILED; - } -#endif - else if (profilingVerbosityString == "default") - { -#if (NV_TENSORRT_MAJOR > 7) - sample::gLogWarning << "--profilingVerbosity=default has been deprecated by " - "--profilingVerbosity=layer_names_only." - << std::endl; - profilingVerbosity = nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY; -#else - profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; -#endif - } - else if (profilingVerbosityString == "verbose") - { -#if (NV_TENSORRT_MAJOR > 7) - sample::gLogWarning << "--profilingVerbosity=verbose has been deprecated by --profilingVerbosity=detailed." 
- << std::endl; - profilingVerbosity = nvinfer1::ProfilingVerbosity::kDETAILED; -#else - profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; -#endif - } - else if (!profilingVerbosityString.empty()) - { - throw std::invalid_argument(std::string("Unknown profilingVerbosity: ") + profilingVerbosityString); - } - - if (getAndDelOption(arguments, "--loadEngine", engine)) - { - load = true; - } - if (getAndDelOption(arguments, "--saveEngine", engine)) - { - save = true; - } - if (load && save) - { - throw std::invalid_argument("Incompatible load and save engine options selected"); - } - - std::string tacticSourceArgs; - if (getAndDelOption(arguments, "--tacticSources", tacticSourceArgs)) - { - std::vector tacticList = splitToStringVec(tacticSourceArgs, ','); - for (auto& t : tacticList) - { - bool enable{false}; - if (t.front() == '+') - { - enable = true; - } - else if (t.front() != '-') - { - throw std::invalid_argument( - "Tactic source must be prefixed with + or -, indicating whether it should be enabled or disabled " - "respectively."); - } - t.erase(0, 1); - - const auto toUpper = [](std::string& sourceName) { - std::transform( - sourceName.begin(), sourceName.end(), sourceName.begin(), [](char c) { return std::toupper(c); }); - return sourceName; - }; - - nvinfer1::TacticSource source{}; - t = toUpper(t); - if (t == "CUBLAS") - { - source = nvinfer1::TacticSource::kCUBLAS; - } - else if (t == "CUBLASLT" || t == "CUBLAS_LT") - { - source = nvinfer1::TacticSource::kCUBLAS_LT; - } -#if (NV_TENSORRT_MAJOR > 7) - else if (t == "CUDNN") - { - source = nvinfer1::TacticSource::kCUDNN; - } -#endif - else - { - throw std::invalid_argument(std::string("Unknown tactic source: ") + t); - } - - uint32_t sourceBit = 1U << static_cast(source); - - if (enable) - { - enabledTactics |= sourceBit; - } - else - { - disabledTactics |= sourceBit; - } - - if (enabledTactics & disabledTactics) - { - throw std::invalid_argument(std::string("Cannot enable and disable ") + t); - } - } - } - - bool noBuilderCache{false}; - getAndDelOption(arguments, "--noBuilderCache", noBuilderCache); - getAndDelOption(arguments, "--timingCacheFile", timingCacheFile); - if (noBuilderCache) - { - timingCacheMode = TimingCacheMode::kDISABLE; - } - else if (!timingCacheFile.empty()) - { - timingCacheMode = TimingCacheMode::kGLOBAL; - } - else - { - timingCacheMode = TimingCacheMode::kLOCAL; - } -} - -void SystemOptions::parse(Arguments& arguments) -{ - getAndDelOption(arguments, "--device", device); - getAndDelOption(arguments, "--useDLACore", DLACore); - getAndDelOption(arguments, "--allowGPUFallback", fallback); - std::string pluginName; - while (getAndDelOption(arguments, "--plugins", pluginName)) - { - plugins.emplace_back(pluginName); - } -} - -void InferenceOptions::parse(Arguments& arguments) -{ - getAndDelOption(arguments, "--streams", streams); - getAndDelOption(arguments, "--iterations", iterations); - getAndDelOption(arguments, "--duration", duration); - getAndDelOption(arguments, "--warmUp", warmup); - getAndDelOption(arguments, "--sleepTime", sleep); - getAndDelOption(arguments, "--idleTime", idle); - bool exposeDMA{false}; - if (getAndDelOption(arguments, "--exposeDMA", exposeDMA)) - { - overlap = !exposeDMA; - } - getAndDelOption(arguments, "--noDataTransfers", skipTransfers); - getAndDelOption(arguments, "--useManagedMemory", useManaged); - getAndDelOption(arguments, "--useSpinWait", spin); - getAndDelOption(arguments, "--threads", threads); - getAndDelOption(arguments, "--useCudaGraph", graph); - 
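-    // The flags below control the benchmark run itself. --loadInputs consumes
-    // comma-separated name:file pairs (for example, a hypothetical
-    // --loadInputs=input0:in0.bin maps tensor "input0" to file "in0.bin");
-    // tensors without a supplied file fall back to randomly generated inputs.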
getAndDelOption(arguments, "--separateProfileRun", rerun);
-    getAndDelOption(arguments, "--buildOnly", skip);
-    getAndDelOption(arguments, "--timeDeserialize", timeDeserialize);
-    getAndDelOption(arguments, "--timeRefit", timeRefit);
-
-    std::string list;
-    getAndDelOption(arguments, "--loadInputs", list);
-    std::vector<std::string> inputsList{splitToStringVec(list, ',')};
-    splitInsertKeyValue(inputsList, inputs);
-
-    getShapesInference(arguments, shapes, "--shapes");
-    getAndDelOption(arguments, "--batch", batch);
-}
-
-void ReportingOptions::parse(Arguments& arguments)
-{
-    getAndDelOption(arguments, "--percentile", percentile);
-    getAndDelOption(arguments, "--avgRuns", avgs);
-    getAndDelOption(arguments, "--verbose", verbose);
-    getAndDelOption(arguments, "--dumpRefit", refit);
-    getAndDelOption(arguments, "--dumpOutput", output);
-    getAndDelOption(arguments, "--dumpProfile", profile);
-    getAndDelOption(arguments, "--dumpLayerInfo", layerInfo);
-    getAndDelOption(arguments, "--exportTimes", exportTimes);
-    getAndDelOption(arguments, "--exportOutput", exportOutput);
-    getAndDelOption(arguments, "--exportProfile", exportProfile);
-    getAndDelOption(arguments, "--exportLayerInfo", exportLayerInfo);
-    if (percentile < 0 || percentile > 100)
-    {
-        throw std::invalid_argument(std::string("Percentile ") + std::to_string(percentile) + " is not in [0,100]");
-    }
-}
-
-bool parseHelp(Arguments& arguments)
-{
-    bool helpLong{false};
-    bool helpShort{false};
-    getAndDelOption(arguments, "--help", helpLong);
-    getAndDelOption(arguments, "-h", helpShort);
-    return helpLong || helpShort;
-}
-
-void AllOptions::parse(Arguments& arguments)
-{
-    model.parse(arguments);
-    build.parse(arguments);
-    system.parse(arguments);
-    inference.parse(arguments);
-
-    // Use explicitBatch when input model is ONNX or when dynamic shapes are used.
-    const bool isOnnx{model.baseModel.format == ModelFormat::kONNX};
-    const bool hasDynamicShapes{!build.shapes.empty() || !inference.shapes.empty()};
-    const bool detectedExplicitBatch = isOnnx || hasDynamicShapes;
-
-    // Throw an error if user tries to use --batch or --maxBatch when the engine has explicit batch dim.
-    const bool maxBatchWasSet{build.maxBatch != maxBatchNotProvided};
-    const bool batchWasSet{inference.batch != batchNotProvided};
-    if (detectedExplicitBatch && (maxBatchWasSet || batchWasSet))
-    {
-        throw std::invalid_argument(
-            "The --batch and --maxBatch flags should not be used when the input model is ONNX or when dynamic shapes "
-            "are provided. Please use --optShapes and --shapes to set input shapes instead.");
-    }
-
-    // If batch and/or maxBatch is not set and the engine has implicit batch dim, set them to default values.
-    if (!detectedExplicitBatch)
-    {
-        // If batch is not set, set it to default value.
-        if (!batchWasSet)
-        {
-            inference.batch = defaultBatch;
-        }
-        // If maxBatch is not set, set it to be equal to batch.
-        if (!maxBatchWasSet)
-        {
-            build.maxBatch = inference.batch;
-        }
-        // MaxBatch should not be less than batch.
-        if (build.maxBatch < inference.batch)
-        {
-            throw std::invalid_argument("Build max batch " + std::to_string(build.maxBatch)
-                + " is less than inference batch " + std::to_string(inference.batch));
-        }
-    }
-
-    if (build.shapes.empty() && !inference.shapes.empty())
-    {
-        // If --shapes are provided but --optShapes are not, assume that optShapes is the same as shapes.
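-        // For example, a lone --shapes=input:1x3x224x224 (hypothetical tensor name) makes
-        // 1x3x224x224 the kMIN, kOPT, and kMAX shape of the build profile populated below.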
-        for (auto& s : inference.shapes)
-        {
-            insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kMIN, s.first, s.second);
-            insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kOPT, s.first, s.second);
-            insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kMAX, s.first, s.second);
-        }
-    }
-    else if (!build.shapes.empty() && inference.shapes.empty())
-    {
-        // If --optShapes are provided but --shapes are not, assume that shapes is the same as optShapes.
-        for (auto& s : build.shapes)
-        {
-            insertShapesInference(
-                inference.shapes, s.first, s.second[static_cast<size_t>(nvinfer1::OptProfileSelector::kOPT)]);
-        }
-    }
-
-    reporting.parse(arguments);
-    helps = parseHelp(arguments);
-
-    if (!helps)
-    {
-        if (!build.load && model.baseModel.format == ModelFormat::kANY)
-        {
-            throw std::invalid_argument("Model missing or format not recognized");
-        }
-        if (build.safe && system.DLACore >= 0)
-        {
-            auto checkSafeDLAFormats = [](std::vector<IOFormat> const& fmt) {
-                return fmt.empty() ? false : std::all_of(fmt.begin(), fmt.end(), [](IOFormat const& pair) {
-                    bool supported{false};
-                    bool const isLINEAR{pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kLINEAR)};
-                    bool const isCHW4{pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kCHW4)};
-                    bool const isCHW32{pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kCHW32)};
-                    bool const isCHW16{pair.second == 1U << static_cast<int32_t>(nvinfer1::TensorFormat::kCHW16)};
-                    supported |= pair.first == nvinfer1::DataType::kINT8 && (isLINEAR || isCHW4 || isCHW32);
-                    supported |= pair.first == nvinfer1::DataType::kHALF && (isLINEAR || isCHW4 || isCHW16);
-                    return supported;
-                });
-            };
-            if (!checkSafeDLAFormats(build.inputFormats) || !checkSafeDLAFormats(build.outputFormats))
-            {
-                throw std::invalid_argument(
-                    "I/O formats for safe DLA capability are restricted to fp16/int8:linear, fp16:chw16 or int8:chw32");
-            }
-            if (system.fallback)
-            {
-                throw std::invalid_argument("GPU fallback (--allowGPUFallback) not allowed for safe DLA capability");
-            }
-        }
-    }
-}
-
-void SafeBuilderOptions::parse(Arguments& arguments)
-{
-    auto getFormats = [&arguments](std::vector<IOFormat>& formatsVector, const char* argument) {
-        std::string list;
-        getAndDelOption(arguments, argument, list);
-        std::vector<std::string> formats{splitToStringVec(list, ',')};
-        for (const auto& f : formats)
-        {
-            formatsVector.push_back(stringToValue<IOFormat>(f));
-        }
-    };
-
-    getAndDelOption(arguments, "--serialized", serialized);
-    getAndDelOption(arguments, "--onnx", onnxModelFile);
-    getAndDelOption(arguments, "--help", help);
-    getAndDelOption(arguments, "-h", help);
-    getAndDelOption(arguments, "--verbose", verbose);
-    getAndDelOption(arguments, "-v", verbose);
-    getFormats(inputFormats, "--inputIOFormats");
-    getFormats(outputFormats, "--outputIOFormats");
-    getAndDelOption(arguments, "--int8", int8);
-    getAndDelOption(arguments, "--calib", calibFile);
-    getAndDelOption(arguments, "--consistency", consistency);
-    getAndDelOption(arguments, "--std", standard);
-    std::string pluginName;
-    while (getAndDelOption(arguments, "--plugins", pluginName))
-    {
-        plugins.emplace_back(pluginName);
-    }
-}
-
-std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options)
-{
-    os << "=== Model Options ===" << std::endl;
-
-    os << "Format: ";
-    switch (options.format)
-    {
-    case ModelFormat::kCAFFE:
-    {
-        os << "Caffe";
-        break;
-    }
-    case ModelFormat::kONNX:
-    {
-        os << "ONNX";
-        break;
-    }
-    case ModelFormat::kUFF:
-    {
-        os << "UFF";
-        break;
-    }
-    case ModelFormat::kANY:
-        os << "*";
-        break;
-    }
-    os <<
std::endl << "Model: " << options.model << std::endl; - - return os; -} - -std::ostream& operator<<(std::ostream& os, const UffInput& input) -{ - os << "Uff Inputs Layout: " << (input.NHWC ? "NHWC" : "NCHW") << std::endl; - for (const auto& i : input.inputs) - { - os << "Input: " << i.first << "," << i.second.d[0] << "," << i.second.d[1] << "," << i.second.d[2] << std::endl; - } - - return os; -} - -std::ostream& operator<<(std::ostream& os, const ModelOptions& options) -{ - os << options.baseModel; - switch (options.baseModel.format) - { - case ModelFormat::kCAFFE: - { - os << "Prototxt: " << options.prototxt << std::endl; - break; - } - case ModelFormat::kUFF: - { - os << options.uffInputs; - break; - } - case ModelFormat::kONNX: // Fallthrough: No options to report for ONNX or the generic case - case ModelFormat::kANY: - break; - } - - os << "Output:"; - for (const auto& o : options.outputs) - { - os << " " << o; - } - os << std::endl; - - return os; -} - -std::ostream& operator<<(std::ostream& os, nvinfer1::DataType dtype) -{ - switch (dtype) - { - case nvinfer1::DataType::kFLOAT: - { - os << "fp32"; - break; - } - case nvinfer1::DataType::kHALF: - { - os << "fp16"; - break; - } - case nvinfer1::DataType::kINT8: - { - os << "int8"; - break; - } - case nvinfer1::DataType::kINT32: - { - os << "int32"; - break; - } - case nvinfer1::DataType::kBOOL: - { - os << "bool"; - break; - } - } - return os; -} - -std::ostream& operator<<(std::ostream& os, IOFormat const& format) -{ - os << format.first << ":"; - - for (int32_t f = 0; f < nvinfer1::EnumMax(); ++f) - { - if ((1U << f) & format.second) - { - if (f) - { - os << "+"; - } - switch (nvinfer1::TensorFormat(f)) - { - case nvinfer1::TensorFormat::kLINEAR: - { - os << "chw"; - break; - } - case nvinfer1::TensorFormat::kCHW2: - { - os << "chw2"; - break; - } - case nvinfer1::TensorFormat::kHWC8: - { - os << "hwc8"; - break; - } -#if (NV_TENSORRT_MAJOR > 7) - case nvinfer1::TensorFormat::kHWC16: - { - os << "hwc16"; - break; - } -#endif - case nvinfer1::TensorFormat::kCHW4: - { - os << "chw4"; - break; - } - case nvinfer1::TensorFormat::kCHW16: - { - os << "chw16"; - break; - } - case nvinfer1::TensorFormat::kCHW32: - { - os << "chw32"; - break; - } - case nvinfer1::TensorFormat::kDHWC8: - { - os << "dhwc8"; - break; - } - case nvinfer1::TensorFormat::kCDHW32: - { - os << "cdhw32"; - break; - } - case nvinfer1::TensorFormat::kHWC: - { - os << "hwc"; - break; - } - case nvinfer1::TensorFormat::kDLA_LINEAR: - { - os << "dla_linear"; - break; - } - case nvinfer1::TensorFormat::kDLA_HWC4: - { - os << "dla_hwc4"; - break; - } - } - } - } - return os; -} - -std::ostream& operator<<(std::ostream& os, const ShapeRange& dims) -{ - int32_t i = 0; - for (const auto& d : dims) - { - if (!d.size()) - { - break; - } - os << (i ? "+" : "") << d; - ++i; - } - return os; -} - -std::ostream& operator<<(std::ostream& os, LayerPrecisions const& layerPrecisions) -{ - int32_t i = 0; - for (auto const& layerPrecision : layerPrecisions) - { - os << (i ? 
"," : "") << layerPrecision.first << ":" << layerPrecision.second; - ++i; - } - return os; -} - -std::ostream& operator<<(std::ostream& os, const BuildOptions& options) -{ - // clang-format off - os << "=== Build Options ===" << std::endl << - - "Max batch: "; printBatch(os, options.maxBatch) << std::endl << - "Memory Pools: "; printMemoryPools(os, options) << std::endl << - "minTiming: " << options.minTiming << std::endl << - "avgTiming: " << options.avgTiming << std::endl << - "Precision: "; printPrecision(os, options) << std::endl << - "LayerPrecisions: " << options.layerPrecisions << std::endl << - "Calibration: " << (options.int8 && options.calibration.empty() ? "Dynamic" : options.calibration.c_str()) << std::endl << - "Refit: " << boolToEnabled(options.refittable) << std::endl << - "Sparsity: "; printSparsity(os, options) << std::endl << - "Safe mode: " << boolToEnabled(options.safe) << std::endl << - "DirectIO mode: " << boolToEnabled(options.directIO) << std::endl << - "Restricted mode: " << boolToEnabled(options.restricted) << std::endl << - "Save engine: " << (options.save ? options.engine : "") << std::endl << - "Load engine: " << (options.load ? options.engine : "") << std::endl << - "Profiling verbosity: " << static_cast(options.profilingVerbosity) << std::endl << - "Tactic sources: "; printTacticSources(os, options.enabledTactics, options.disabledTactics) << std::endl << - "timingCacheMode: "; printTimingCache(os, options) << std::endl << - "timingCacheFile: " << options.timingCacheFile << std::endl; - // clang-format on - - auto printIOFormats = [](std::ostream& os, const char* direction, const std::vector formats) { - if (formats.empty()) - { - os << direction << "s format: fp32:CHW" << std::endl; - } - else - { - for(const auto& f : formats) - { - os << direction << ": " << f << std::endl; - } - } - }; - - printIOFormats(os, "Input(s)", options.inputFormats); - printIOFormats(os, "Output(s)", options.outputFormats); - printShapes(os, "build", options.shapes); - printShapes(os, "calibration", options.shapesCalib); - - return os; -} - -std::ostream& operator<<(std::ostream& os, const SystemOptions& options) -{ - // clang-format off - os << "=== System Options ===" << std::endl << - - "Device: " << options.device << std::endl << - "DLACore: " << (options.DLACore != -1 ? std::to_string(options.DLACore) : "") << - (options.DLACore != -1 && options.fallback ? 
"(With GPU fallback)" : "") << std::endl; - os << "Plugins:"; - - for (const auto& p : options.plugins) - { - os << " " << p; - } - os << std::endl; - - return os; - // clang-format on -} - -std::ostream& operator<<(std::ostream& os, const InferenceOptions& options) -{ -// clang-format off - os << "=== Inference Options ===" << std::endl << - - "Batch: "; - if (options.batch && options.shapes.empty()) - { - os << options.batch << std::endl; - } - else - { - os << "Explicit" << std::endl; - } - printShapes(os, "inference", options.shapes); - os << "Iterations: " << options.iterations << std::endl << - "Duration: " << options.duration << "s (+ " - << options.warmup << "ms warm up)" << std::endl << - "Sleep time: " << options.sleep << "ms" << std::endl << - "Idle time: " << options.idle << "ms" << std::endl << - "Streams: " << options.streams << std::endl << - "ExposeDMA: " << boolToEnabled(!options.overlap) << std::endl << - "Data transfers: " << boolToEnabled(!options.skipTransfers) << std::endl << - "Spin-wait: " << boolToEnabled(options.spin) << std::endl << - "Multithreading: " << boolToEnabled(options.threads) << std::endl << - "CUDA Graph: " << boolToEnabled(options.graph) << std::endl << - "Separate profiling: " << boolToEnabled(options.rerun) << std::endl << - "Time Deserialize: " << boolToEnabled(options.timeDeserialize) << std::endl << - "Time Refit: " << boolToEnabled(options.timeRefit) << std::endl << - "Skip inference: " << boolToEnabled(options.skip) << std::endl; - -// clang-format on - os << "Inputs:" << std::endl; - for (const auto& input : options.inputs) - { - os << input.first << "<-" << input.second << std::endl; - } - - return os; -} - -std::ostream& operator<<(std::ostream& os, const ReportingOptions& options) -{ -// clang-format off - os << "=== Reporting Options ===" << std::endl << - - "Verbose: " << boolToEnabled(options.verbose) << std::endl << - "Averages: " << options.avgs << " inferences" << std::endl << - "Percentile: " << options.percentile << std::endl << - "Dump refittable layers:" << boolToEnabled(options.refit) << std::endl << - "Dump output: " << boolToEnabled(options.output) << std::endl << - "Profile: " << boolToEnabled(options.profile) << std::endl << - "Export timing to JSON file: " << options.exportTimes << std::endl << - "Export output to JSON file: " << options.exportOutput << std::endl << - "Export profile to JSON file: " << options.exportProfile << std::endl; -// clang-format on - - return os; -} - -std::ostream& operator<<(std::ostream& os, const AllOptions& options) -{ - os << options.model << options.build << options.system << options.inference << options.reporting << std::endl; - return os; -} - -std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options) -{ - auto printIOFormats = [](std::ostream& os, const char* direction, const std::vector formats) { - if (formats.empty()) - { - os << direction << "s format: fp32:CHW" << std::endl; - } - else - { - for(const auto& f : formats) - { - os << direction << ": " << f << std::endl; - } - } - }; - - os << "=== Build Options ===" << std::endl; - os << "Model ONNX: " << options.onnxModelFile << std::endl; - - os << "Precision: FP16"; - if (options.int8) - { - os << " + INT8"; - } - os << std::endl; - os << "Calibration file: " << options.calibFile << std::endl; - os << "Serialized Network: " << options.serialized << std::endl; - - printIOFormats(os, "Input(s)", options.inputFormats); - printIOFormats(os, "Output(s)", options.outputFormats); - - os << "Plugins:"; - for (const auto& 
p : options.plugins) - { - os << " " << p; - } - os << std::endl; - return os; -} - -void BaseModelOptions::help(std::ostream& os) -{ -// clang-format off - os << " --uff= UFF model" << std::endl << - " --onnx= ONNX model" << std::endl << - " --model= Caffe model (default = no model, random weights used)" << std::endl; -// clang-format on -} - -void UffInput::help(std::ostream& os) -{ -// clang-format off - os << " --uffInput=,X,Y,Z Input blob name and its dimensions (X,Y,Z=C,H,W), it can be specified " - "multiple times; at least one is required for UFF models" << std::endl << - " --uffNHWC Set if inputs are in the NHWC layout instead of NCHW (use " << - "X,Y,Z=H,W,C order in --uffInput)" << std::endl; -// clang-format on -} - -void ModelOptions::help(std::ostream& os) -{ -// clang-format off - os << "=== Model Options ===" << std::endl; - BaseModelOptions::help(os); - os << " --deploy= Caffe prototxt file" << std::endl << - " --output=[,]* Output names (it can be specified multiple times); at least one output " - "is required for UFF and Caffe" << std::endl; - UffInput::help(os); -// clang-format on -} - -void BuildOptions::help(std::ostream& os) -{ -// clang-format off - os << "=== Build Options ===" "\n" - " --maxBatch Set max batch size and build an implicit batch engine (default = same size as --batch)" "\n" - " This option should not be used when the input model is ONNX or when dynamic shapes are provided." "\n" - " --minShapes=spec Build with dynamic shapes using a profile with the min shapes provided" "\n" - " --optShapes=spec Build with dynamic shapes using a profile with the opt shapes provided" "\n" - " --maxShapes=spec Build with dynamic shapes using a profile with the max shapes provided" "\n" - " --minShapesCalib=spec Calibrate with dynamic shapes using a profile with the min shapes provided" "\n" - " --optShapesCalib=spec Calibrate with dynamic shapes using a profile with the opt shapes provided" "\n" - " --maxShapesCalib=spec Calibrate with dynamic shapes using a profile with the max shapes provided" "\n" - " Note: All three of min, opt and max shapes must be supplied." "\n" - " However, if only opt shapes is supplied then it will be expanded so" "\n" - " that min shapes and max shapes are set to the same values as opt shapes." "\n" - " Input names can be wrapped with escaped single quotes (ex: \\\'Input:0\\\')." "\n" - " Example input shapes spec: input0:1x3x256x256,input1:1x3x128x128" "\n" - " Each input shape is supplied as a key-value pair where key is the input name and" "\n" - " value is the dimensions (including the batch dimension) to be used for that input." "\n" - " Each key-value pair has the key and value separated using a colon (:)." "\n" - " Multiple input shapes can be provided via comma-separated key-value pairs." "\n" - " --inputIOFormats=spec Type and format of each of the input tensors (default = all inputs in fp32:chw)" "\n" - " See --outputIOFormats help for the grammar of type and format list." "\n" - " Note: If this option is specified, please set comma-separated types and formats for all" "\n" - " inputs following the same order as network inputs ID (even if only one input" "\n" - " needs specifying IO format) or set the type and format once for broadcasting." 
"\n" - " --outputIOFormats=spec Type and format of each of the output tensors (default = all outputs in fp32:chw)" "\n" - " Note: If this option is specified, please set comma-separated types and formats for all" "\n" - " outputs following the same order as network outputs ID (even if only one output" "\n" - " needs specifying IO format) or set the type and format once for broadcasting." "\n" - " IO Formats: spec ::= IOfmt[\",\"spec]" "\n" - " IOfmt ::= type:fmt" "\n" - " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" "\n" - " fmt ::= (\"chw\"|\"chw2\"|\"chw4\"|\"hwc8\"|\"chw16\"|\"chw32\"|\"dhwc8\")[\"+\"fmt]" "\n" - " --workspace=N Set workspace size in MiB." "\n" - " --memPoolSize=poolspec Specify the size constraints of the designated memory pool(s) in MiB." "\n" - " Note: Also accepts decimal sizes, e.g. 0.25MiB. Will be rounded down to the nearest integer bytes." "\n" - " Pool constraint: poolspec ::= poolfmt[\",\"poolspec]" "\n" - " poolfmt ::= pool:sizeInMiB" "\n" - " pool ::= \"workspace\"|\"dlaSRAM\"|\"dlaLocalDRAM\"|\"dlaGlobalDRAM\"" "\n" - " --profilingVerbosity=mode Specify profiling verbosity. mode ::= layer_names_only|detailed|none (default = layer_names_only)" "\n" - " --minTiming=M Set the minimum number of iterations used in kernel selection (default = " - << defaultMinTiming << ")" "\n" - " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " - << defaultAvgTiming << ")" "\n" - " --refit Mark the engine as refittable. This will allow the inspection of refittable layers " "\n" - " and weights within the engine." "\n" - " --sparsity=spec Control sparsity (default = disabled). " "\n" - " Sparsity: spec ::= \"disable\", \"enable\", \"force\"" "\n" - " Note: Description about each of these options is as below" "\n" - " disable = do not enable sparse tactics in the builder (this is the default)" "\n" - " enable = enable sparse tactics in the builder (but these tactics will only be" "\n" - " considered if the weights have the right sparsity pattern)" "\n" - " force = enable sparse tactics in the builder and force-overwrite the weights to have" "\n" - " a sparsity pattern (even if you loaded a model yourself)" "\n" - " --noTF32 Disable tf32 precision (default is to enable tf32, in addition to fp32)" "\n" - " --fp16 Enable fp16 precision, in addition to fp32 (default = disabled)" "\n" - " --int8 Enable int8 precision, in addition to fp32 (default = disabled)" "\n" - " --best Enable all precisions to achieve the best performance (default = disabled)" "\n" - " --directIO Avoid reformatting at network boundaries. (default = disabled)" "\n" - " --precisionConstraints=spec Control precision constraint setting. (default = none)" "\n" - " Precision Constaints: spec ::= \"none\" | \"obey\" | \"prefer\"" "\n" - " none = no constraints" "\n" - " prefer = meet precision constraints set by --layerPrecisions/--layerOutputTypes if possible" "\n" - " obey = meet precision constraints set by --layerPrecisions/--layerOutputTypes or fail" "\n" - " otherwise" "\n" - " --layerPrecisions=spec Control per-layer precision constraints. Effective only when precisionConstraints is set to" "\n" - " \"obey\" or \"prefer\". (default = none)" "\n" - " The specs are read left-to-right, and later ones override earlier ones. \"*\" can be used as a" "\n" - " layerName to specify the default precision for all the unspecified layers." 
"\n" - " Per-layer precision spec ::= layerPrecision[\",\"spec]" "\n" - " layerPrecision ::= layerName\":\"precision" "\n" - " precision ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" "\n" - " --layerOutputTypes=spec Control per-layer output type constraints. Effective only when precisionConstraints is set to" "\n" - " \"obey\" or \"prefer\". (default = none)" "\n" - " The specs are read left-to-right, and later ones override earlier ones. \"*\" can be used as a" "\n" - " layerName to specify the default precision for all the unspecified layers. If a layer has more than""\n" - " one output, then multiple types separated by \"+\" can be provided for this layer." "\n" - " Per-layer output type spec ::= layerOutputTypes[\",\"spec]" "\n" - " layerOutputTypes ::= layerName\":\"type" "\n" - " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"[\"+\"type]" "\n" - " --calib= Read INT8 calibration cache file" "\n" - " --safe Enable build safety certified engine" "\n" - " --consistency Perform consistency checking on safety certified engine" "\n" - " --restricted Enable safety scope checking with kSAFETY_SCOPE build flag" "\n" - " --saveEngine= Save the serialized engine" "\n" - " --loadEngine= Load a serialized engine" "\n" - " --tacticSources=tactics Specify the tactics to be used by adding (+) or removing (-) tactics from the default " "\n" - " tactic sources (default = all available tactics)." "\n" - " Note: Currently only cuDNN, cuBLAS and cuBLAS-LT are listed as optional tactics." "\n" - " Tactic Sources: tactics ::= [\",\"tactic]" "\n" - " tactic ::= (+|-)lib" "\n" - " lib ::= \"CUBLAS\"|\"CUBLAS_LT\"|\"CUDNN\"" "\n" - " For example, to disable cudnn and enable cublas: --tacticSources=-CUDNN,+CUBLAS" "\n" - " --noBuilderCache Disable timing cache in builder (default is to enable timing cache)" "\n" - " --timingCacheFile= Save/load the serialized global timing cache" "\n" - ; -// clang-format on - os << std::flush; -} - -void SystemOptions::help(std::ostream& os) -{ -// clang-format off - os << "=== System Options ===" << std::endl << - " --device=N Select cuda device N (default = " << defaultDevice << ")" << std::endl << - " --useDLACore=N Select DLA core N for layers that support DLA (default = none)" << std::endl << - " --allowGPUFallback When DLA is enabled, allow GPU fallback for unsupported layers " - "(default = disabled)" << std::endl; - os << " --plugins Plugin library (.so) to load (can be specified multiple times)" << std::endl; -// clang-format on -} - -void InferenceOptions::help(std::ostream& os) -{ - // clang-format off - os << "=== Inference Options ===" << std::endl << - " --batch=N Set batch size for implicit batch engines (default = " << defaultBatch << ")" << std::endl << - " This option should not be used when the engine is built from an ONNX model or when dynamic" << std::endl << - " shapes are provided when the engine is built." << std::endl << - " --shapes=spec Set input shapes for dynamic shapes inference inputs." << std::endl << - " Note: Input names can be wrapped with escaped single quotes (ex: \\\'Input:0\\\')." << std::endl << - " Example input shapes spec: input0:1x3x256x256, input1:1x3x128x128" << std::endl << - " Each input shape is supplied as a key-value pair where key is the input name and" << std::endl << - " value is the dimensions (including the batch dimension) to be used for that input." << std::endl << - " Each key-value pair has the key and value separated using a colon (:)." 
<< std::endl <<
-        " Multiple input shapes can be provided via comma-separated key-value pairs." << std::endl <<
-        " --loadInputs=spec Load input values from files (default = generate random inputs). Input names can be "
-        "wrapped with single quotes (ex: 'Input:0')" << std::endl <<
-        " Input values spec ::= Ival[\",\"spec]" << std::endl <<
-        " Ival ::= name\":\"file" << std::endl <<
-        " --iterations=N Run at least N inference iterations (default = " << defaultIterations << ")" << std::endl <<
-        " --warmUp=N Run for N milliseconds to warmup before measuring performance (default = "
-        << defaultWarmUp << ")" << std::endl <<
-        " --duration=N Run performance measurements for at least N seconds wallclock time (default = "
-        << defaultDuration << ")" << std::endl <<
-        " --sleepTime=N Delay inference start with a gap of N milliseconds between launch and compute "
-        "(default = " << defaultSleep << ")" << std::endl <<
-        " --idleTime=N Sleep N milliseconds between two continuous iterations "
-        "(default = " << defaultIdle << ")" << std::endl <<
-        " --streams=N Instantiate N engines to use concurrently (default = " << defaultStreams << ")" << std::endl <<
-        " --exposeDMA Serialize DMA transfers to and from device (default = disabled)." << std::endl <<
-        " --noDataTransfers Disable DMA transfers to and from device (default = enabled)." << std::endl <<
-        " --useManagedMemory Use managed memory instead of separate host and device allocations (default = disabled)." << std::endl <<
-        " --useSpinWait Actively synchronize on GPU events. This option may decrease synchronization time but "
-        "increase CPU usage and power (default = disabled)" << std::endl <<
-        " --threads Enable multithreading to drive engines with independent threads"
-        " or speed up refitting (default = disabled) " << std::endl <<
-        " --useCudaGraph Use CUDA graph to capture engine execution and then launch inference (default = disabled)." << std::endl <<
-        " This flag may be ignored if the graph capture fails." << std::endl <<
-        " --timeDeserialize Time the amount of time it takes to deserialize the network and exit." << std::endl <<
-        " --timeRefit Time the amount of time it takes to refit the engine before inference."
<< std::endl << - " --separateProfileRun Do not attach the profiler in the benchmark run; if profiling is enabled, a second " - "profile run will be executed (default = disabled)" << std::endl << - " --buildOnly Skip inference perf measurement (default = disabled)" << std::endl; - // clang-format on -} - -void ReportingOptions::help(std::ostream& os) -{ -// clang-format off - os << "=== Reporting Options ===" << std::endl << - " --verbose Use verbose logging (default = false)" << std::endl << - " --avgRuns=N Report performance measurements averaged over N consecutive " - "iterations (default = " << defaultAvgRuns << ")" << std::endl << - " --percentile=P Report performance for the P percentage (0<=P<=100, 0 " - "representing max perf, and 100 representing min perf; (default" - " = " << defaultPercentile << "%)" << std::endl << - " --dumpRefit Print the refittable layers and weights from a refittable " - "engine" << std::endl << - " --dumpOutput Print the output tensor(s) of the last inference iteration " - "(default = disabled)" << std::endl << - " --dumpProfile Print profile information per layer (default = disabled)" << std::endl << - " --dumpLayerInfo Print layer information of the engine to console " - "(default = disabled)" << std::endl << - " --exportTimes= Write the timing results in a json file (default = disabled)" << std::endl << - " --exportOutput= Write the output tensors to a json file (default = disabled)" << std::endl << - " --exportProfile= Write the profile information per layer in a json file " - "(default = disabled)" << std::endl << - " --exportLayerInfo= Write the layer information of the engine in a json file " - "(default = disabled)" << std::endl; -// clang-format on -} - -void helpHelp(std::ostream& os) -{ -// clang-format off - os << "=== Help ===" << std::endl << - " --help, -h Print this message" << std::endl; -// clang-format on -} - -void AllOptions::help(std::ostream& os) -{ - ModelOptions::help(os); - os << std::endl; - BuildOptions::help(os); - os << std::endl; - InferenceOptions::help(os); - os << std::endl; -// clang-format off - os << "=== Build and Inference Batch Options ===" << std::endl << - " When using implicit batch, the max batch size of the engine, if not given, " << std::endl << - " is set to the inference batch size;" << std::endl << - " when using explicit batch, if shapes are specified only for inference, they " << std::endl << - " will be used also as min/opt/max in the build profile; if shapes are " << std::endl << - " specified only for the build, the opt shapes will be used also for inference;" << std::endl << - " if both are specified, they must be compatible; and if explicit batch is " << std::endl << - " enabled but neither is specified, the model must provide complete static" << std::endl << - " dimensions, including batch size, for all inputs" << std::endl << - " Using ONNX models automatically forces explicit batch." << std::endl << - std::endl; - // clang-format on - ReportingOptions::help(os); - os << std::endl; - SystemOptions::help(os); - os << std::endl; - helpHelp(os); -} - -void SafeBuilderOptions::printHelp(std::ostream& os) -{ -// clang-format off - os << "=== Mandatory ===" << std::endl << - " --onnx= ONNX model" << std::endl << - " " << std::endl << - "=== Optional ===" << std::endl << - " --inputIOFormats=spec Type and format of each of the input tensors (default = all inputs in fp32:chw)" << std::endl << - " See --outputIOFormats help for the grammar of type and format list." 
<< std::endl << - " Note: If this option is specified, please set comma-separated types and formats for all" << std::endl << - " inputs following the same order as network inputs ID (even if only one input" << std::endl << - " needs specifying IO format) or set the type and format once for broadcasting." << std::endl << - " --outputIOFormats=spec Type and format of each of the output tensors (default = all outputs in fp32:chw)" << std::endl << - " Note: If this option is specified, please set comma-separated types and formats for all" << std::endl << - " outputs following the same order as network outputs ID (even if only one output" << std::endl << - " needs specifying IO format) or set the type and format once for broadcasting." << std::endl << - " IO Formats: spec ::= IOfmt[\",\"spec]" << std::endl << - " IOfmt ::= type:fmt" << std::endl << - " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" << std::endl << - " fmt ::= (\"chw\"|\"chw2\"|\"chw4\"|\"hwc8\"|\"chw16\"|\"chw32\"|\"dhwc8\")[\"+\"fmt]" << std::endl << - " --int8 Enable int8 precision, in addition to fp16 (default = disabled)" << std::endl << - " --consistency Enable consistency check for serialized engine, (default = disabled)" << std::endl << - " --std Build standard serialized engine, (default = disabled)" << std::endl << - " --calib= Read INT8 calibration cache file" << std::endl << - " --serialized= Save the serialized network" << std::endl << - " --plugins Plugin library (.so) to load (can be specified multiple times)" << std::endl << - " --verbose or -v Use verbose logging (default = false)" << std::endl << - " --help or -h Print this message" << std::endl << - " " << std::endl; -// clang-format on -} - -} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.h deleted file mode 100644 index 8975e1ea..00000000 --- a/src/Detector/tensorrt_yolo/common_deprecated/sampleOptions.h +++ /dev/null @@ -1,355 +0,0 @@ -/* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-#ifndef TRT_SAMPLE_OPTIONS_H
-#define TRT_SAMPLE_OPTIONS_H
-
-#include <algorithm>
-#include <array>
-#include <iostream>
-#include <stdexcept>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include "NvInfer.h"
-
-namespace sample
-{
-
-// Build default params
-constexpr int32_t maxBatchNotProvided{0};
-constexpr int32_t defaultMinTiming{1};
-constexpr int32_t defaultAvgTiming{8};
-
-// System default params
-constexpr int32_t defaultDevice{0};
-
-// Inference default params
-constexpr int32_t defaultBatch{1};
-constexpr int32_t batchNotProvided{0};
-constexpr int32_t defaultStreams{1};
-constexpr int32_t defaultIterations{10};
-constexpr float defaultWarmUp{200.F};
-constexpr float defaultDuration{3.F};
-constexpr float defaultSleep{};
-constexpr float defaultIdle{};
-
-// Reporting default params
-constexpr int32_t defaultAvgRuns{10};
-constexpr float defaultPercentile{99};
-
-enum class PrecisionConstraints
-{
-    kNONE,
-    kOBEY,
-    kPREFER
-};
-
-enum class ModelFormat
-{
-    kANY,
-    kCAFFE,
-    kONNX,
-    kUFF
-};
-
-enum class SparsityFlag
-{
-    kDISABLE,
-    kENABLE,
-    kFORCE
-};
-
-enum class TimingCacheMode
-{
-    kDISABLE,
-    kLOCAL,
-    kGLOBAL
-};
-
-using Arguments = std::unordered_multimap<std::string, std::string>;
-
-using IOFormat = std::pair<nvinfer1::DataType, nvinfer1::TensorFormats>;
-
-using ShapeRange = std::array<std::vector<int32_t>, nvinfer1::EnumMax<nvinfer1::OptProfileSelector>()>;
-
-using LayerPrecisions = std::unordered_map<std::string, nvinfer1::DataType>;
-using LayerOutputTypes = std::unordered_map<std::string, std::vector<nvinfer1::DataType>>;
-
-struct Options
-{
-    virtual void parse(Arguments& arguments) = 0;
-};
-
-struct BaseModelOptions : public Options
-{
-    ModelFormat format{ModelFormat::kANY};
-    std::string model;
-
-    void parse(Arguments& arguments) override;
-
-    static void help(std::ostream& out);
-};
-
-struct UffInput : public Options
-{
-    std::vector<std::pair<std::string, nvinfer1::Dims3>> inputs;
-    bool NHWC{false};
-
-    void parse(Arguments& arguments) override;
-
-    static void help(std::ostream& out);
-};
-
-struct ModelOptions : public Options
-{
-    BaseModelOptions baseModel;
-    std::string prototxt;
-    std::vector<std::string> outputs;
-    UffInput uffInputs;
-
-    void parse(Arguments& arguments) override;
-
-    static void help(std::ostream& out);
-};
-
-struct BuildOptions : public Options
-{
-    int32_t maxBatch{maxBatchNotProvided};
-    double workspace{-1.0};
-    double dlaSRAM{-1.0};
-    double dlaLocalDRAM{-1.0};
-    double dlaGlobalDRAM{-1.0};
-    int32_t minTiming{defaultMinTiming};
-    int32_t avgTiming{defaultAvgTiming};
-    bool tf32{true};
-    bool fp16{false};
-    bool int8{false};
-    bool directIO{false};
-    PrecisionConstraints precisionConstraints{PrecisionConstraints::kNONE};
-    LayerPrecisions layerPrecisions;
-    LayerOutputTypes layerOutputTypes;
-    bool safe{false};
-    bool consistency{false};
-    bool restricted{false};
-    bool save{false};
-    bool load{false};
-    bool refittable{false};
-    SparsityFlag sparsity{SparsityFlag::kDISABLE};
-#if (NV_TENSORRT_MAJOR > 7)
-    nvinfer1::ProfilingVerbosity profilingVerbosity{nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY};
-#else
-    nvinfer1::ProfilingVerbosity profilingVerbosity{ nvinfer1::ProfilingVerbosity::kDEFAULT };
-#endif
-    std::string engine;
-    std::string calibration;
-    std::unordered_map<std::string, ShapeRange> shapes;
-    std::unordered_map<std::string, ShapeRange> shapesCalib;
-    std::vector<IOFormat> inputFormats;
-    std::vector<IOFormat> outputFormats;
-    nvinfer1::TacticSources enabledTactics{0};
-    nvinfer1::TacticSources disabledTactics{0};
-    TimingCacheMode timingCacheMode{TimingCacheMode::kLOCAL};
-    std::string timingCacheFile{};
-    void parse(Arguments& arguments) override;
-
-    static void help(std::ostream& out);
-};
-
-struct SystemOptions : public Options
-{
-    int32_t device{defaultDevice};
-    int32_t DLACore{-1};
-    bool fallback{false};
-    std::vector<std::string> plugins;
-
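-    // Parses --device, --useDLACore, --allowGPUFallback, and repeated --plugins entries.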
-    void parse(Arguments& arguments) override;
-
-    static void help(std::ostream& out);
-};
-
-struct InferenceOptions : public Options
-{
-    int32_t batch{batchNotProvided};
-    int32_t iterations{defaultIterations};
-    int32_t streams{defaultStreams};
-    float warmup{defaultWarmUp};
-    float duration{defaultDuration};
-    float sleep{defaultSleep};
-    float idle{defaultIdle};
-    bool overlap{true};
-    bool skipTransfers{false};
-    bool useManaged{false};
-    bool spin{false};
-    bool threads{false};
-    bool graph{false};
-    bool skip{false};
-    bool rerun{false};
-    bool timeDeserialize{false};
-    bool timeRefit{false};
-    std::unordered_map<std::string, std::string> inputs;
-    std::unordered_map<std::string, std::vector<int32_t>> shapes;
-
-    void parse(Arguments& arguments) override;
-
-    static void help(std::ostream& out);
-};
-
-struct ReportingOptions : public Options
-{
-    bool verbose{false};
-    int32_t avgs{defaultAvgRuns};
-    float percentile{defaultPercentile};
-    bool refit{false};
-    bool output{false};
-    bool profile{false};
-    bool layerInfo{false};
-    std::string exportTimes;
-    std::string exportOutput;
-    std::string exportProfile;
-    std::string exportLayerInfo;
-
-    void parse(Arguments& arguments) override;
-
-    static void help(std::ostream& out);
-};
-
-struct SafeBuilderOptions : public Options
-{
-    std::string serialized{};
-    std::string onnxModelFile{};
-    bool help{false};
-    bool verbose{false};
-    std::vector<IOFormat> inputFormats;
-    std::vector<IOFormat> outputFormats;
-    bool int8{false};
-    std::string calibFile{};
-    std::vector<std::string> plugins;
-    bool consistency{false};
-    bool standard{false};
-
-    void parse(Arguments& arguments) override;
-
-    static void printHelp(std::ostream& out);
-};
-
-struct AllOptions : public Options
-{
-    ModelOptions model;
-    BuildOptions build;
-    SystemOptions system;
-    InferenceOptions inference;
-    ReportingOptions reporting;
-    bool helps{false};
-
-    void parse(Arguments& arguments) override;
-
-    static void help(std::ostream& out);
-};
-
-Arguments argsToArgumentsMap(int32_t argc, char* argv[]);
-
-bool parseHelp(Arguments& arguments);
-
-void helpHelp(std::ostream& out);
-
-// Functions to print options
-
-std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options);
-
-std::ostream& operator<<(std::ostream& os, const UffInput& input);
-
-std::ostream& operator<<(std::ostream& os, const IOFormat& format);
-
-std::ostream& operator<<(std::ostream& os, const ShapeRange& dims);
-
-std::ostream& operator<<(std::ostream& os, const ModelOptions& options);
-
-std::ostream& operator<<(std::ostream& os, const BuildOptions& options);
-
-std::ostream& operator<<(std::ostream& os, const SystemOptions& options);
-
-std::ostream& operator<<(std::ostream& os, const InferenceOptions& options);
-
-std::ostream& operator<<(std::ostream& os, const ReportingOptions& options);
-
-std::ostream& operator<<(std::ostream& os, const AllOptions& options);
-
-std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options);
-
-inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims)
-{
-    for (int32_t i = 0; i < dims.nbDims; ++i)
-    {
-        os << (i ? "x" : "") << dims.d[i];
-    }
-    return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const nvinfer1::WeightsRole role)
-{
-    switch (role)
-    {
-    case nvinfer1::WeightsRole::kKERNEL:
-    {
-        os << "Kernel";
-        break;
-    }
-    case nvinfer1::WeightsRole::kBIAS:
-    {
-        os << "Bias";
-        break;
-    }
-    case nvinfer1::WeightsRole::kSHIFT:
-    {
-        os << "Shift";
-        break;
-    }
-    case nvinfer1::WeightsRole::kSCALE:
-    {
-        os << "Scale";
-        break;
-    }
-    case nvinfer1::WeightsRole::kCONSTANT:
-    {
-        os << "Constant";
-        break;
-    }
-#if (NV_TENSORRT_MAJOR > 7)
-    case nvinfer1::WeightsRole::kANY:
-    {
-        os << "Any";
-        break;
-    }
-#endif
-    }
-
-    return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const std::vector<int32_t>& vec)
-{
-    for (int32_t i = 0, e = static_cast<int32_t>(vec.size()); i < e; ++i)
-    {
-        os << (i ? "x" : "") << vec[i];
-    }
-    return os;
-}
-
-} // namespace sample
-
-#endif // TRT_SAMPLE_OPTIONS_H
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.cpp b/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.cpp
deleted file mode 100644
index a92938c5..00000000
--- a/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.cpp
+++ /dev/null
@@ -1,445 +0,0 @@
-/*
- * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <algorithm>
-#include <exception>
-#include <fstream>
-#include <iomanip>
-#include <numeric>
-#include <sstream>
-#include <utility>
-
-#include "sampleInference.h"
-#include "sampleOptions.h"
-#include "sampleReporting.h"
-
-namespace sample
-{
-
-namespace
-{
-
-//!
-//! \brief Find percentile in an ascending sequence of timings
-//! \note percentile must be in [0, 100]. Otherwise, an exception is thrown.
-//!
-template <typename T>
-float findPercentile(float percentile, std::vector<InferenceTime> const& timings, T const& toFloat)
-{
-    int32_t const all = static_cast<int32_t>(timings.size());
-    int32_t const exclude = static_cast<int32_t>((1 - percentile / 100) * all);
-    if (timings.empty())
-    {
-        return std::numeric_limits<float>::infinity();
-    }
-    if (percentile < 0.0f || percentile > 100.0f)
-    {
-        throw std::runtime_error("percentile is not in [0, 100]!");
-    }
-    return toFloat(timings[std::max(all - 1 - exclude, 0)]);
-}
-
-//!
-//! \brief Find median in a sorted sequence of timings
-//!
-template <typename T>
-float findMedian(std::vector<InferenceTime> const& timings, T const& toFloat)
-{
-    if (timings.empty())
-    {
-        return std::numeric_limits<float>::infinity();
-    }
-
-    int32_t const m = timings.size() / 2;
-    if (timings.size() % 2)
-    {
-        return toFloat(timings[m]);
-    }
-
-    return (toFloat(timings[m - 1]) + toFloat(timings[m])) / 2;
-}
-
-//!
-//! \brief Find coefficient of variance (which is std / mean) in a sorted sequence of timings given the mean
-//! \note The value returned below is expressed in percent: 100 * stdev / mean.
-//!
-template <typename T>
-float findCoeffOfVariance(std::vector<InferenceTime> const& timings, T const& toFloat, float mean)
-{
-    if (timings.empty())
-    {
-        return 0;
-    }
-
-    if (mean == 0.F)
-    {
-        return std::numeric_limits<float>::infinity();
-    }
-
-    auto const metricAccumulator = [toFloat, mean](float acc, InferenceTime const& a) {
-        float const diff = toFloat(a) - mean;
-        return acc + diff * diff;
-    };
-    float const variance = std::accumulate(timings.begin(), timings.end(), 0.F, metricAccumulator) / timings.size();
-
-    return std::sqrt(variance) / mean * 100.F;
-}
-
-inline InferenceTime traceToTiming(const InferenceTrace& a)
-{
-    return InferenceTime((a.enqEnd - a.enqStart), (a.h2dEnd - a.h2dStart), (a.computeEnd - a.computeStart),
-        (a.d2hEnd - a.d2hStart), (a.d2hEnd - a.h2dStart));
-}
-
-} // namespace
-
-void printProlog(int32_t warmups, int32_t timings, float warmupMs, float benchTimeMs, std::ostream& os)
-{
-    os << "Warmup completed " << warmups << " queries over " << warmupMs << " ms" << std::endl;
-    os << "Timing trace has " << timings << " queries over " << benchTimeMs / 1000 << " s" << std::endl;
-}
-
-void printTiming(std::vector<InferenceTime> const& timings, int32_t runsPerAvg, std::ostream& os)
-{
-    int32_t count = 0;
-    InferenceTime sum;
-
-    os << std::endl;
-    os << "=== Trace details ===" << std::endl;
-    os << "Trace averages of " << runsPerAvg << " runs:" << std::endl;
-    for (auto const& t : timings)
-    {
-        sum += t;
-
-        if (++count == runsPerAvg)
-        {
-            // clang-format off
-            os << "Average on " << runsPerAvg << " runs - GPU latency: " << sum.compute / runsPerAvg
-               << " ms - Host latency: " << sum.latency() / runsPerAvg << " ms (end to end " << sum.e2e / runsPerAvg
-               << " ms, enqueue " << sum.enq / runsPerAvg << " ms)" << std::endl;
-            // clang-format on
-            count = 0;
-            sum.enq = 0;
-            sum.h2d = 0;
-            sum.compute = 0;
-            sum.d2h = 0;
-            sum.e2e = 0;
-        }
-    }
-}
-
-void printMetricExplanations(std::ostream& os)
-{
-    os << std::endl;
-    os << "=== Explanations of the performance metrics ===" << std::endl;
-    os << "Total Host Walltime: the host walltime from when the first query (after warmups) is enqueued to when the "
-          "last query is completed."
-       << std::endl;
-    os << "GPU Compute Time: the GPU latency to execute the kernels for a query." << std::endl;
-    os << "Total GPU Compute Time: the summation of the GPU Compute Time of all the queries. If this is significantly "
-          "shorter than Total Host Walltime, the GPU may be under-utilized because of host-side overheads or data "
-          "transfers."
-       << std::endl;
-    os << "Throughput: the observed throughput computed by dividing the number of queries by the Total Host Walltime. "
-          "If this is significantly lower than the reciprocal of GPU Compute Time, the GPU may be under-utilized "
-          "because of host-side overheads or data transfers."
-       << std::endl;
-    os << "Enqueue Time: the host latency to enqueue a query. If this is longer than GPU Compute Time, the GPU may be "
-          "under-utilized."
-       << std::endl;
-    os << "H2D Latency: the latency for host-to-device data transfers for input tensors of a single query."
-       << std::endl;
-    os << "D2H Latency: the latency for device-to-host data transfers for output tensors of a single query."
-       << std::endl;
-    os << "Latency: the summation of H2D Latency, GPU Compute Time, and D2H Latency. This is the latency to infer a "
-          "single query."
-       << std::endl;
-    os << "End-to-End Host Latency: the duration from when the H2D of a query is called to when the D2H of the same "
-          "query is completed, which includes the latency to wait for the completion of the previous query. This is "
-          "the latency of a query if multiple queries are enqueued consecutively."
-       << std::endl;
-}
-
-PerformanceResult getPerformanceResult(std::vector<InferenceTime> const& timings,
-    std::function<float(InferenceTime const&)> metricGetter, float percentile)
-{
-    auto const metricComparator
-        = [metricGetter](InferenceTime const& a, InferenceTime const& b) { return metricGetter(a) < metricGetter(b); };
-    auto const metricAccumulator = [metricGetter](float acc, InferenceTime const& a) { return acc + metricGetter(a); };
-    std::vector<InferenceTime> newTimings = timings;
-    std::sort(newTimings.begin(), newTimings.end(), metricComparator);
-    PerformanceResult result;
-    result.min = metricGetter(newTimings.front());
-    result.max = metricGetter(newTimings.back());
-    result.mean = std::accumulate(newTimings.begin(), newTimings.end(), 0.0f, metricAccumulator) / newTimings.size();
-    result.median = findMedian(newTimings, metricGetter);
-    result.percentile = findPercentile(percentile, newTimings, metricGetter);
-    result.coeffVar = findCoeffOfVariance(newTimings, metricGetter, result.mean);
-    return result;
-}
-
-void printEpilog(std::vector<InferenceTime> const& timings, float walltimeMs, float percentile, int32_t batchSize,
-    std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose)
-{
-    float const throughput = batchSize * timings.size() / walltimeMs * 1000;
-
-    auto const getLatency = [](InferenceTime const& t) { return t.latency(); };
-    auto const latencyResult = getPerformanceResult(timings, getLatency, percentile);
-
-    auto const getEndToEnd = [](InferenceTime const& t) { return t.e2e; };
-    auto const e2eLatencyResult = getPerformanceResult(timings, getEndToEnd, percentile);
-
-    auto const getEnqueue = [](InferenceTime const& t) { return t.enq; };
-    auto const enqueueResult = getPerformanceResult(timings, getEnqueue, percentile);
-
-    auto const getH2d = [](InferenceTime const& t) { return t.h2d; };
-    auto const h2dResult = getPerformanceResult(timings, getH2d, percentile);
-
-    auto const getCompute = [](InferenceTime const& t) { return t.compute; };
-    auto const gpuComputeResult = getPerformanceResult(timings, getCompute, percentile);
-
-    auto const getD2h = [](InferenceTime const& t) { return t.d2h; };
-    auto const d2hResult = getPerformanceResult(timings, getD2h, percentile);
-
-    auto const toPerfString = [percentile](const PerformanceResult& r) {
-        std::stringstream s;
-        s << "min = " << r.min << " ms, max = " << r.max << " ms, mean = " << r.mean << " ms, "
-          << "median = " << r.median << " ms, percentile(" << percentile << "%) = " << r.percentile << " ms";
-        return s.str();
-    };
-
-    osInfo << std::endl;
-    osInfo << "=== Performance summary ===" << std::endl;
-    osInfo << "Throughput: " << throughput << " qps" << std::endl;
-    osInfo << "Latency: " << toPerfString(latencyResult) << std::endl;
-    osInfo << "End-to-End Host Latency: " << toPerfString(e2eLatencyResult) << std::endl;
-    osInfo << "Enqueue Time: " << toPerfString(enqueueResult) << std::endl;
-    osInfo << "H2D Latency: " << toPerfString(h2dResult) << std::endl;
-    osInfo << "GPU Compute Time: " << toPerfString(gpuComputeResult) << std::endl;
-    osInfo << "D2H Latency: " << toPerfString(d2hResult) << std::endl;
-    osInfo << "Total Host Walltime: " << walltimeMs / 1000 << " s" << std::endl;
-    osInfo << "Total GPU Compute Time: " << gpuComputeResult.mean * timings.size() / 1000 << " s" << std::endl;
-
-    // Report warnings if the throughput is bound by other factors than GPU Compute Time.
-    constexpr float kENQUEUE_BOUND_REPORTING_THRESHOLD{0.8F};
-    if (enqueueResult.median > kENQUEUE_BOUND_REPORTING_THRESHOLD * gpuComputeResult.median)
-    {
-        osWarning
-            << "* Throughput may be bound by Enqueue Time rather than GPU Compute and the GPU may be under-utilized."
-            << std::endl;
-        osWarning << "  If not already in use, --useCudaGraph (utilize CUDA graphs where possible) may increase the "
-                     "throughput."
-                  << std::endl;
-    }
-    if (h2dResult.median >= gpuComputeResult.median)
-    {
-        osWarning << "* Throughput may be bound by host-to-device transfers for the inputs rather than GPU Compute and "
-                     "the GPU may be under-utilized."
-                  << std::endl;
-        osWarning << "  Add --noDataTransfers flag to disable data transfers." << std::endl;
-    }
-    if (d2hResult.median >= gpuComputeResult.median)
-    {
-        osWarning << "* Throughput may be bound by device-to-host transfers for the outputs rather than GPU Compute "
-                     "and the GPU may be under-utilized."
-                  << std::endl;
-        osWarning << "  Add --noDataTransfers flag to disable data transfers." << std::endl;
-    }
-
-    // Report warnings if the GPU Compute Time is unstable.
-    constexpr float kUNSTABLE_PERF_REPORTING_THRESHOLD{1.0F};
-    if (gpuComputeResult.coeffVar > kUNSTABLE_PERF_REPORTING_THRESHOLD)
-    {
-        osWarning << "* GPU compute time is unstable, with coefficient of variance = " << gpuComputeResult.coeffVar
-                  << "%." << std::endl;
-        osWarning << "  If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the "
-                  << "stability." << std::endl;
-    }
-
-    // Explain what the metrics mean.
-    osInfo << "Explanations of the performance metrics are printed in the verbose logs." << std::endl;
-    printMetricExplanations(osVerbose);
-
-    osInfo << std::endl;
-}
-
-void printPerformanceReport(std::vector<InferenceTrace> const& trace, const ReportingOptions& reporting, float warmupMs,
-    int32_t batchSize, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose)
-{
-    auto const isNotWarmup = [&warmupMs](const InferenceTrace& a) { return a.computeStart >= warmupMs; };
-    auto const noWarmup = std::find_if(trace.begin(), trace.end(), isNotWarmup);
-    int32_t const warmups = noWarmup - trace.begin();
-    float const benchTime = trace.back().d2hEnd - noWarmup->h2dStart;
-    // when implicit batch used, batchSize = options.inference.batch, which is parsed through --batch
-    // when explicit batch used, batchSize = options.inference.batch = 0
-    // treat inference with explicit batch as a single query and report the throughput
-    batchSize = batchSize ? batchSize : 1;
-    printProlog(warmups * batchSize, (trace.size() - warmups) * batchSize, warmupMs, benchTime, osInfo);
-
-    std::vector<InferenceTime> timings(trace.size() - warmups);
-    std::transform(noWarmup, trace.end(), timings.begin(), traceToTiming);
-    printTiming(timings, reporting.avgs, osInfo);
-    printEpilog(timings, benchTime, reporting.percentile, batchSize, osInfo, osWarning, osVerbose);
-
-    if (!reporting.exportTimes.empty())
-    {
-        exportJSONTrace(trace, reporting.exportTimes);
-    }
-}
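A note on the statistics reported above: min, max, mean, median, percentile, and coefficient of variance are all derived from the sorted per-query timings. The arithmetic is small enough to show standalone; below is a minimal sketch with hypothetical latencies, in plain C++ without the sample:: helpers (the rank-based percentile line only approximates what findPercentile does):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main()
    {
        std::vector<float> latMs{2.1F, 2.0F, 2.3F, 2.2F, 9.0F}; // hypothetical per-query latencies
        std::sort(latMs.begin(), latMs.end());
        float const mean = std::accumulate(latMs.begin(), latMs.end(), 0.F) / latMs.size();
        float sq = 0.F;
        for (float v : latMs)
            sq += (v - mean) * (v - mean);
        // same formula as findCoeffOfVariance: stddev / mean, as a percentage
        float const coeffVar = std::sqrt(sq / latMs.size()) / mean * 100.F;
        // rank-based percentile index into the sorted timings
        size_t const p90 = static_cast<size_t>(0.9 * (latMs.size() - 1));
        std::printf("mean=%.2f ms  p90=%.2f ms  cov=%.1f%%\n", mean, latMs[p90], coeffVar);
        return 0;
    }

The single 9.0 ms outlier is what the coefficient-of-variance warning above is designed to surface.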
-
-//! Printed format:
-//! [ value, ...]
-//! value ::= { "start enq" : time, "end enq" : time, "start h2d" : time, "end h2d" : time, "start compute" : time,
-//!             "end compute" : time, "start d2h" : time, "end d2h" : time, "h2d" : time, "compute" : time,
-//!             "d2h" : time, "latency" : time, "end to end" : time }
-//!
-void exportJSONTrace(std::vector<InferenceTrace> const& trace, std::string const& fileName)
-{
-    std::ofstream os(fileName, std::ofstream::trunc);
-    os << "[" << std::endl;
-    char const* sep = "  ";
-    for (auto const& t : trace)
-    {
-        InferenceTime const it(traceToTiming(t));
-        os << sep << "{ ";
-        sep = ", ";
-        // clang-format off
-        os << "\"startEnqMs\" : " << t.enqStart << sep << "\"endEnqMs\" : " << t.enqEnd << sep
-           << "\"startH2dMs\" : " << t.h2dStart << sep << "\"endH2dMs\" : " << t.h2dEnd << sep
-           << "\"startComputeMs\" : " << t.computeStart << sep << "\"endComputeMs\" : " << t.computeEnd << sep
-           << "\"startD2hMs\" : " << t.d2hStart << sep << "\"endD2hMs\" : " << t.d2hEnd << sep
-           << "\"h2dMs\" : " << it.h2d << sep << "\"computeMs\" : " << it.compute << sep
-           << "\"d2hMs\" : " << it.d2h << sep << "\"latencyMs\" : " << it.latency() << sep
-           << "\"endToEndMs\" : " << it.e2e << " }" << std::endl;
-        // clang-format on
-    }
-    os << "]" << std::endl;
-}
-
-void Profiler::reportLayerTime(char const* layerName, float timeMs) noexcept
-{
-    if (mIterator == mLayers.end())
-    {
-        bool const first = !mLayers.empty() && mLayers.begin()->name == layerName;
-        mUpdatesCount += mLayers.empty() || first;
-        if (first)
-        {
-            mIterator = mLayers.begin();
-        }
-        else
-        {
-            mLayers.emplace_back();
-            mLayers.back().name = layerName;
-            mIterator = mLayers.end() - 1;
-        }
-    }
-
-    mIterator->timeMs += timeMs;
-    ++mIterator;
-}
-
-void Profiler::print(std::ostream& os) const noexcept
-{
-    std::string const nameHdr("Layer");
-    std::string const timeHdr("   Time (ms)");
-    std::string const avgHdr("   Avg. Time (ms)");
-    std::string const percentageHdr("   Time %");
-
-    float const totalTimeMs = getTotalTime();
-
-    auto const cmpLayer = [](LayerProfile const& a, LayerProfile const& b) { return a.name.size() < b.name.size(); };
-    auto const longestName = std::max_element(mLayers.begin(), mLayers.end(), cmpLayer);
-    auto const nameLength = std::max(longestName->name.size() + 1, nameHdr.size());
-    auto const timeLength = timeHdr.size();
-    auto const avgLength = avgHdr.size();
-    auto const percentageLength = percentageHdr.size();
-
-    os << std::endl
-       << "=== Profile (" << mUpdatesCount << " iterations ) ===" << std::endl
-       << std::setw(nameLength) << nameHdr << timeHdr << avgHdr << percentageHdr << std::endl;
-
-    for (auto const& p : mLayers)
-    {
-        // clang-format off
-        os << std::setw(nameLength) << p.name << std::setw(timeLength) << std::fixed << std::setprecision(2) << p.timeMs
-           << std::setw(avgLength) << std::fixed << std::setprecision(4) << p.timeMs / mUpdatesCount
-           << std::setw(percentageLength) << std::fixed << std::setprecision(1) << p.timeMs / totalTimeMs * 100
-           << std::endl;
-    }
-    {
-        os << std::setw(nameLength) << "Total" << std::setw(timeLength) << std::fixed << std::setprecision(2)
-           << totalTimeMs << std::setw(avgLength) << std::fixed << std::setprecision(4) << totalTimeMs / mUpdatesCount
-           << std::setw(percentageLength) << std::fixed << std::setprecision(1) << 100.0 << std::endl;
-        // clang-format on
-    }
-    os << std::endl;
-}
-
-void Profiler::exportJSONProfile(std::string const& fileName) const noexcept
-{
-    std::ofstream os(fileName, std::ofstream::trunc);
-    os << "[" << std::endl << "  { \"count\" : " << mUpdatesCount << " }" << std::endl;
-
-    auto const totalTimeMs = getTotalTime();
-
-    for (auto const& l : mLayers)
-    {
-        // clang-format off
-        os << ", {" << " \"name\" : \"" << l.name << "\""
-              ", \"timeMs\" : " << l.timeMs
-           << ", \"averageMs\" : " << l.timeMs / mUpdatesCount
-           << ", \"percentage\" : " << l.timeMs / totalTimeMs * 100
-           << " }" << std::endl;
-        // clang-format on
-    }
-    os << "]" << std::endl;
-}
-
-void dumpInputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os)
-{
-    os << "Input Tensors:" << std::endl;
-    bindings.dumpInputs(context, os);
-}
-
-void dumpOutputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os)
-{
-    os << "Output Tensors:" << std::endl;
-    bindings.dumpOutputs(context, os);
-}
-
-void exportJSONOutput(
-    nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::string const& fileName, int32_t batch)
-{
-    std::ofstream os(fileName, std::ofstream::trunc);
-    std::string sep = "  ";
-    auto const output = bindings.getOutputBindings();
-    os << "[" << std::endl;
-    for (auto const& binding : output)
-    {
-        // clang-format off
-        os << sep << "{ \"name\" : \"" << binding.first << "\"" << std::endl;
-        sep = ", ";
-        os << "  " << sep << "\"dimensions\" : \"";
-        bindings.dumpBindingDimensions(binding.second, context, os);
-        os << "\"" << std::endl;
-        os << "  " << sep << "\"values\" : [ ";
-        bindings.dumpBindingValues(context, binding.second, os, sep, batch);
-        os << " ]" << std::endl << "  }" << std::endl;
-        // clang-format on
-    }
-    os << "]" << std::endl;
-}
-
-} // namespace sample
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.h
deleted file mode 100644
index 5f730987..00000000
--- a/src/Detector/tensorrt_yolo/common_deprecated/sampleReporting.h
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef TRT_SAMPLE_REPORTING_H
-#define TRT_SAMPLE_REPORTING_H
-
-#include <functional>
-#include <iostream>
-
-#include "NvInfer.h"
-
-#include "sampleOptions.h"
-#include "sampleUtils.h"
-
-namespace sample
-{
-
-//!
-//! \struct InferenceTime
-//! \brief Measurement times in milliseconds
-//!
-struct InferenceTime
-{
-    InferenceTime(float q, float i, float c, float o, float e)
-        : enq(q)
-        , h2d(i)
-        , compute(c)
-        , d2h(o)
-        , e2e(e)
-    {
-    }
-
-    InferenceTime() = default;
-    InferenceTime(InferenceTime const&) = default;
-    InferenceTime(InferenceTime&&) = default;
-    InferenceTime& operator=(InferenceTime const&) = default;
-    InferenceTime& operator=(InferenceTime&&) = default;
-    ~InferenceTime() = default;
-
-    float enq{0};     // Enqueue
-    float h2d{0};     // Host to Device
-    float compute{0}; // Compute
-    float d2h{0};     // Device to Host
-    float e2e{0};     // end to end
-
-    // ideal latency
-    float latency() const
-    {
-        return h2d + compute + d2h;
-    }
-};
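For readers of this header: latency() is the "ideal" per-query cost and deliberately excludes queue wait, which only e2e captures. A tiny usage sketch (values hypothetical; assumes this header is on the include path, and uses the operator+/operator+= defined just below):

    #include "sampleReporting.h"

    int main()
    {
        // two hypothetical queries: enq, h2d, compute, d2h, e2e (ms)
        sample::InferenceTime a(0.1F, 0.2F, 1.5F, 0.2F, 2.5F);
        sample::InferenceTime const b(0.1F, 0.2F, 1.6F, 0.2F, 2.6F);
        a += b;                                     // field-wise sum
        float const avgLat = a.latency() / 2;       // (h2d + compute + d2h) / runs = 1.95 ms
        return avgLat > 0.F ? 0 : 1;
    }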
-
-//!
-//! \struct InferenceTrace
-//! \brief Measurement points in milliseconds
-//!
-struct InferenceTrace
-{
-    InferenceTrace(int32_t s, float es, float ee, float is, float ie, float cs, float ce, float os, float oe)
-        : stream(s)
-        , enqStart(es)
-        , enqEnd(ee)
-        , h2dStart(is)
-        , h2dEnd(ie)
-        , computeStart(cs)
-        , computeEnd(ce)
-        , d2hStart(os)
-        , d2hEnd(oe)
-    {
-    }
-
-    InferenceTrace() = default;
-    InferenceTrace(InferenceTrace const&) = default;
-    InferenceTrace(InferenceTrace&&) = default;
-    InferenceTrace& operator=(InferenceTrace const&) = default;
-    InferenceTrace& operator=(InferenceTrace&&) = default;
-    ~InferenceTrace() = default;
-
-    int32_t stream{0};
-    float enqStart{0};
-    float enqEnd{0};
-    float h2dStart{0};
-    float h2dEnd{0};
-    float computeStart{0};
-    float computeEnd{0};
-    float d2hStart{0};
-    float d2hEnd{0};
-};
-
-inline InferenceTime operator+(InferenceTime const& a, InferenceTime const& b)
-{
-    return InferenceTime(a.enq + b.enq, a.h2d + b.h2d, a.compute + b.compute, a.d2h + b.d2h, a.e2e + b.e2e);
-}
-
-inline InferenceTime operator+=(InferenceTime& a, InferenceTime const& b)
-{
-    return a = a + b;
-}
-
-//!
-//! \struct PerformanceResult
-//! \brief Performance result of a performance metric
-//!
-struct PerformanceResult
-{
-    float min{0};
-    float max{0};
-    float mean{0};
-    float median{0};
-    float percentile{0};
-    float coeffVar{0}; // coefficient of variation
-};
-
-//!
-//! \brief Print benchmarking time and number of traces collected
-//!
-void printProlog(int32_t warmups, int32_t timings, float warmupMs, float walltime, std::ostream& os);
-
-//!
-//! \brief Print a timing trace
-//!
-void printTiming(std::vector<InferenceTime> const& timings, int32_t runsPerAvg, std::ostream& os);
-
-//!
-//! \brief Print the performance summary of a trace
-//!
-void printEpilog(std::vector<InferenceTime> const& timings, float walltimeMs, float percentile, int32_t batchSize,
-    std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose);
-
-//!
-//! \brief Get the result of a specific performance metric from a trace
-//!
-PerformanceResult getPerformanceResult(std::vector<InferenceTime> const& timings,
-    std::function<float(InferenceTime const&)> metricGetter, float percentile);
-
-//!
-//! \brief Print the explanations of the performance metrics printed in printEpilog() function.
-//!
-void printMetricExplanations(std::ostream& os);
-
-//!
-//! \brief Print and summarize a timing trace
-//!
-void printPerformanceReport(std::vector<InferenceTrace> const& trace, ReportingOptions const& reporting, float warmupMs,
-    int32_t batchSize, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose);
-
-//!
-//! \brief Export a timing trace to JSON file
-//!
-void exportJSONTrace(std::vector<InferenceTrace> const& trace, std::string const& fileName);
-
-//!
-//! \brief Print input tensors to stream
-//!
-void dumpInputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os);
-
-//!
-//! \brief Print output tensors to stream
-//!
-void dumpOutputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os);
-
-//!
-//! \brief Export output tensors to JSON file
-//!
-void exportJSONOutput(
-    nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::string const& fileName, int32_t batch);
-
-//!
-//! \struct LayerProfile
-//! \brief Layer profile information
-//!
-struct LayerProfile
-{
-    std::string name;
-    float timeMs{0};
-};
-
-//!
-//! \class Profiler
-//! \brief Collect per-layer profile information, assuming times are reported in the same order
-//!
-class Profiler : public nvinfer1::IProfiler
-{
-
-public:
-    void reportLayerTime(char const* layerName, float timeMs) noexcept override;
-
-    void print(std::ostream& os) const noexcept;
-
-    //!
-    //! \brief Export a profile to JSON file
-    //!
-    void exportJSONProfile(std::string const& fileName) const noexcept;
-
-private:
-    float getTotalTime() const noexcept
-    {
-        auto const plusLayerTime = [](float accumulator, LayerProfile const& lp) { return accumulator + lp.timeMs; };
-        return std::accumulate(mLayers.begin(), mLayers.end(), 0.0, plusLayerTime);
-    }
-
-    std::vector<LayerProfile> mLayers;
-    std::vector<LayerProfile>::iterator mIterator{mLayers.begin()};
-    int32_t mUpdatesCount{0};
-};
-
-} // namespace sample
-
-#endif // TRT_SAMPLE_REPORTING_H
diff --git a/src/Detector/tensorrt_yolo/common_deprecated/sampleUtils.h b/src/Detector/tensorrt_yolo/common_deprecated/sampleUtils.h
deleted file mode 100644
index 1509a7fc..00000000
--- a/src/Detector/tensorrt_yolo/common_deprecated/sampleUtils.h
+++ /dev/null
@@ -1,543 +0,0 @@
-/*
- * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef TRT_SAMPLE_UTILS_H
-#define TRT_SAMPLE_UTILS_H
-
-#include <fstream>
-#include <iostream>
-#include <memory>
-#include <numeric>
-#include <random>
-#include <string>
-#include <vector>
-
-#include <cuda.h>
-#include <cuda_fp16.h>
-
-#include "NvInfer.h"
-
-#include "common.h"
-#include "logger.h"
-#include "sampleDevice.h"
-#include "sampleOptions.h"
-
-namespace sample
-{
-
-inline int dataTypeSize(nvinfer1::DataType dataType)
-{
-    switch (dataType)
-    {
-    case nvinfer1::DataType::kINT32:
-    case nvinfer1::DataType::kFLOAT: return 4;
-    case nvinfer1::DataType::kHALF: return 2;
-    case nvinfer1::DataType::kBOOL:
-    case nvinfer1::DataType::kINT8: return 1;
-    }
-    return 0;
-}
-
-template <typename T>
-inline T roundUp(T m, T n)
-{
-    return ((m + n - 1) / n) * n;
-}
-
-inline int volume(const nvinfer1::Dims& d)
-{
-    return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies<int>());
-}
-
-//! comps is the number of components in a vector. Ignored if vecDim < 0.
-inline int64_t volume(const nvinfer1::Dims& dims, const nvinfer1::Dims& strides, int vecDim, int comps, int batch)
-{
-    int maxNbElems = 1;
-    for (int i = 0; i < dims.nbDims; ++i)
-    {
-        // Get effective length of axis.
-        int d = dims.d[i];
-        // Any dimension is 0, it is an empty tensor.
-        if (d == 0)
-        {
-            return 0;
-        }
-        if (i == vecDim)
-        {
-            d = samplesCommon::divUp(d, comps);
-        }
-        maxNbElems = std::max(maxNbElems, d * strides.d[i]);
-    }
-    return static_cast<int64_t>(maxNbElems) * batch * (vecDim < 0 ? 1 : comps);
-}
-
-inline int64_t volume(nvinfer1::Dims dims, int vecDim, int comps, int batch)
-{
-    if (vecDim != -1)
-    {
-        dims.d[vecDim] = roundUp(dims.d[vecDim], comps);
-    }
-    return volume(dims) * std::max(batch, 1);
-}
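The vectorized-format overloads above are the subtle part of this header: for vectorized layouts (e.g. kCHW4) the vector axis is padded up to a multiple of the component count before the element count is taken. A small sketch with assumed dims (values hypothetical):

    #include "sampleUtils.h"

    int main()
    {
        // hypothetical NCHW dims with the C axis vectorized by 4 (e.g. a kCHW4 format)
        nvinfer1::Dims const d{4, {1, 3, 8, 8}};
        int64_t const n = sample::volume(d, /*vecDim=*/1, /*comps=*/4, /*batch=*/1);
        // C=3 is rounded up to 4, so n == 1 * 4 * 8 * 8 == 256 allocated elements
        return n == 256 ? 0 : 1;
    }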
-
-inline nvinfer1::Dims toDims(const std::vector<int32_t>& vec)
-{
-    int limit = static_cast<int32_t>(nvinfer1::Dims::MAX_DIMS);
-    if (static_cast<int32_t>(vec.size()) > limit)
-    {
-        sample::gLogWarning << "Vector too long, only first 8 elements are used in dimension." << std::endl;
-    }
-    // Pick first nvinfer1::Dims::MAX_DIMS elements
-    nvinfer1::Dims dims{std::min(static_cast<int32_t>(vec.size()), limit), {}};
-    std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d));
-    return dims;
-}
-
-template <typename T>
-inline void fillBuffer(void* buffer, int64_t volume, T min, T max)
-{
-    T* typedBuffer = static_cast<T*>(buffer);
-    std::default_random_engine engine;
-    if (std::is_integral<T>::value)
-    {
-        std::uniform_int_distribution<int32_t> distribution(min, max);
-        auto generator = [&engine, &distribution]() { return static_cast<T>(distribution(engine)); };
-        std::generate(typedBuffer, typedBuffer + volume, generator);
-    }
-    else
-    {
-        std::uniform_real_distribution<float> distribution(min, max);
-        auto generator = [&engine, &distribution]() { return static_cast<T>(distribution(engine)); };
-        std::generate(typedBuffer, typedBuffer + volume, generator);
-    }
-}
-
-// Specialization needed for custom type __half
-template <typename H>
-inline void fillBufferHalf(void* buffer, int64_t volume, H min, H max)
-{
-    H* typedBuffer = static_cast<H*>(buffer);
-    std::default_random_engine engine;
-    std::uniform_real_distribution<float> distribution(min, max);
-    auto generator = [&engine, &distribution]() { return static_cast<H>(distribution(engine)); };
-    std::generate(typedBuffer, typedBuffer + volume, generator);
-}
-template <>
-inline void fillBuffer<__half>(void* buffer, int64_t volume, __half min, __half max)
-{
-    fillBufferHalf(buffer, volume, min, max);
-}
-
-template <typename T>
-inline void dumpBuffer(const void* buffer, const std::string& separator, std::ostream& os, const nvinfer1::Dims& dims,
-    const nvinfer1::Dims& strides, int32_t vectorDim, int32_t spv)
-{
-    const int64_t volume = std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies<int64_t>());
-    const T* typedBuffer = static_cast<const T*>(buffer);
-    std::string sep;
-    for (int64_t v = 0; v < volume; ++v)
-    {
-        int64_t curV = v;
-        int32_t dataOffset = 0;
-        for (int32_t dimIndex = dims.nbDims - 1; dimIndex >= 0; --dimIndex)
-        {
-            int32_t dimVal = curV % dims.d[dimIndex];
-            if (dimIndex == vectorDim)
-            {
-                dataOffset += (dimVal / spv) * strides.d[dimIndex] * spv + dimVal % spv;
-            }
-            else
-            {
-                dataOffset += dimVal * strides.d[dimIndex] * (vectorDim == -1 ? 1 : spv);
-            }
-            curV /= dims.d[dimIndex];
-            ASSERT(curV >= 0);
-        }
-
-        os << sep << typedBuffer[dataOffset];
-        sep = separator;
-    }
-}
-
-inline void loadFromFile(std::string const& fileName, char* dst, size_t size)
-{
-    ASSERT(dst);
-
-    std::ifstream file(fileName, std::ios::in | std::ios::binary);
-    if (file.is_open())
-    {
-        file.read(dst, size);
-        file.close();
-    }
-    else
-    {
-        std::stringstream msg;
-        msg << "Cannot open file " << fileName << "!";
-        throw std::invalid_argument(msg.str());
-    }
-}
-
-struct Binding
-{
-    bool isInput{false};
-    std::unique_ptr<IMirroredBuffer> buffer;
-    int64_t volume{0};
-    nvinfer1::DataType dataType{nvinfer1::DataType::kFLOAT};
-
-    void fill(const std::string& fileName)
-    {
-        loadFromFile(fileName, static_cast<char*>(buffer->getHostBuffer()), buffer->getSize());
-    }
-
-    void fill()
-    {
-        switch (dataType)
-        {
-        case nvinfer1::DataType::kBOOL:
-        {
-            fillBuffer<bool>(buffer->getHostBuffer(), volume, 0, 1);
-            break;
-        }
-        case nvinfer1::DataType::kINT32:
-        {
-            fillBuffer<int32_t>(buffer->getHostBuffer(), volume, -128, 127);
-            break;
-        }
-        case nvinfer1::DataType::kINT8:
-        {
-            fillBuffer<int8_t>(buffer->getHostBuffer(), volume, -128, 127);
-            break;
-        }
-        case nvinfer1::DataType::kFLOAT:
-        {
-            fillBuffer<float>(buffer->getHostBuffer(), volume, -1.0F, 1.0F);
-            break;
-        }
-        case nvinfer1::DataType::kHALF:
-        {
-            fillBuffer<__half>(buffer->getHostBuffer(), volume, -1.0F, 1.0F);
-            break;
-        }
-        }
-    }
-
-    void dump(std::ostream& os, nvinfer1::Dims dims, nvinfer1::Dims strides, int32_t vectorDim, int32_t spv,
-        const std::string separator = " ") const
-    {
-        switch (dataType)
-        {
-        case nvinfer1::DataType::kBOOL:
-        {
-            dumpBuffer<bool>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
-            break;
-        }
-        case nvinfer1::DataType::kINT32:
-        {
-            dumpBuffer<int32_t>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
-            break;
-        }
-        case nvinfer1::DataType::kINT8:
-        {
-            dumpBuffer<int8_t>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
-            break;
-        }
-        case nvinfer1::DataType::kFLOAT:
-        {
-            dumpBuffer<float>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
-            break;
-        }
-        case nvinfer1::DataType::kHALF:
-        {
-            dumpBuffer<__half>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
-            break;
-        }
-        }
-    }
-};
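Binding::fill() above simply delegates to fillBuffer with the tensor's dtype; the same helper works on any host array. A minimal sketch:

    #include <vector>
    #include "sampleUtils.h"

    int main()
    {
        std::vector<float> host(10);
        // same call Binding::fill() makes for kFLOAT: uniform random values in [-1, 1]
        sample::fillBuffer<float>(host.data(), static_cast<int64_t>(host.size()), -1.0F, 1.0F);
        return 0;
    }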
-
-class Bindings
-{
-public:
-    Bindings() = delete;
-    explicit Bindings(bool useManaged)
-        : mUseManaged(useManaged)
-    {
-    }
-
-    void addBinding(int b, const std::string& name, bool isInput, int64_t volume, nvinfer1::DataType dataType,
-        const std::string& fileName = "")
-    {
-        while (mBindings.size() <= static_cast<size_t>(b))
-        {
-            mBindings.emplace_back();
-            mDevicePointers.emplace_back();
-        }
-        mNames[name] = b;
-        if (mBindings[b].buffer == nullptr)
-        {
-            if (mUseManaged)
-                mBindings[b].buffer.reset(new UnifiedMirroredBuffer);
-            else
-                mBindings[b].buffer.reset(new DiscreteMirroredBuffer);
-        }
-        mBindings[b].isInput = isInput;
-        // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr
-        // even for empty tensors, so allocate a dummy byte.
-        if (volume == 0)
-            mBindings[b].buffer->allocate(1);
-        else
-            mBindings[b].buffer->allocate(static_cast<size_t>(volume) * static_cast<size_t>(dataTypeSize(dataType)));
-
-        mBindings[b].volume = volume;
-        mBindings[b].dataType = dataType;
-        mDevicePointers[b] = mBindings[b].buffer->getDeviceBuffer();
-        if (isInput)
-        {
-            if (fileName.empty())
-                fill(b);
-            else
-                fill(b, fileName);
-        }
-    }
-
-    void** getDeviceBuffers()
-    {
-        return mDevicePointers.data();
-    }
-
-    void transferInputToDevice(TrtCudaStream& stream)
-    {
-        for (auto& b : mNames)
-        {
-            if (mBindings[b.second].isInput)
-                mBindings[b.second].buffer->hostToDevice(stream);
-        }
-    }
-
-    void transferOutputToHost(TrtCudaStream& stream)
-    {
-        for (auto& b : mNames)
-        {
-            if (!mBindings[b.second].isInput)
-                mBindings[b.second].buffer->deviceToHost(stream);
-        }
-    }
-
-    void fill(int binding, const std::string& fileName)
-    {
-        mBindings[binding].fill(fileName);
-    }
-
-    void fill(int binding)
-    {
-        mBindings[binding].fill();
-    }
-
-    void dumpBindingDimensions(int binding, const nvinfer1::IExecutionContext& context, std::ostream& os) const
-    {
-        const auto dims = context.getBindingDimensions(binding);
-        // Do not add a newline terminator, because the caller may be outputting a JSON string.
-        os << dims;
-    }
-
-    void dumpBindingValues(const nvinfer1::IExecutionContext& context, int binding, std::ostream& os,
-        const std::string& separator = " ", int32_t batch = 1) const
-    {
-        nvinfer1::Dims dims = context.getBindingDimensions(binding);
-        nvinfer1::Dims strides = context.getStrides(binding);
-        int32_t vectorDim = context.getEngine().getBindingVectorizedDim(binding);
-        const int32_t spv = context.getEngine().getBindingComponentsPerElement(binding);
-
-        if (context.getEngine().hasImplicitBatchDimension())
-        {
-            auto insertN = [](nvinfer1::Dims& d, int32_t bs) {
-                const int32_t nbDims = d.nbDims;
-                ASSERT(nbDims < nvinfer1::Dims::MAX_DIMS);
-                std::copy_backward(&d.d[0], &d.d[nbDims], &d.d[nbDims + 1]);
-                d.d[0] = bs;
-                d.nbDims = nbDims + 1;
-            };
-            int32_t batchStride = 0;
-            for (int32_t i = 0; i < strides.nbDims; ++i)
-            {
-                if (strides.d[i] * dims.d[i] > batchStride)
-                {
-                    batchStride = strides.d[i] * dims.d[i];
-                }
-            }
-            insertN(dims, batch);
-            insertN(strides, batchStride);
-            vectorDim = (vectorDim == -1) ? -1 : vectorDim + 1;
-        }
-
-        mBindings[binding].dump(os, dims, strides, vectorDim, spv, separator);
-    }
-
-    void dumpInputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const
-    {
-        auto isInput = [](const Binding& b) { return b.isInput; };
-        dumpBindings(context, isInput, os);
-    }
-
-    void dumpOutputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const
-    {
-        auto isOutput = [](const Binding& b) { return !b.isInput; };
-        dumpBindings(context, isOutput, os);
-    }
-
-    void dumpBindings(const nvinfer1::IExecutionContext& context, std::ostream& os) const
-    {
-        auto all = [](const Binding& /*b*/) { return true; };
-        dumpBindings(context, all, os);
-    }
-
-    void dumpBindings(
-        const nvinfer1::IExecutionContext& context, bool (*predicate)(const Binding& b), std::ostream& os) const
-    {
-        for (const auto& n : mNames)
-        {
-            const auto binding = n.second;
-            if (predicate(mBindings[binding]))
-            {
-                os << n.first << ": (";
-                dumpBindingDimensions(binding, context, os);
-                os << ")" << std::endl;
-
-                dumpBindingValues(context, binding, os);
-                os << std::endl;
-            }
-        }
-    }
-
-    std::unordered_map<std::string, int> getInputBindings() const
-    {
-        auto isInput = [](const Binding& b) { return b.isInput; };
-        return getBindings(isInput);
-    }
-
-    std::unordered_map<std::string, int> getOutputBindings() const
-    {
-        auto isOutput = [](const Binding& b) { return !b.isInput; };
-        return getBindings(isOutput);
-    }
-
-    std::unordered_map<std::string, int> getBindings() const
-    {
-        auto all = [](const Binding& /*b*/) { return true; };
-        return getBindings(all);
-    }
-
-    std::unordered_map<std::string, int> getBindings(bool (*predicate)(const Binding& b)) const
-    {
-        std::unordered_map<std::string, int> bindings;
-        for (const auto& n : mNames)
-        {
-            const auto binding = n.second;
-            if (predicate(mBindings[binding]))
-                bindings.insert(n);
-        }
-        return bindings;
-    }
-
-private:
-    std::unordered_map<std::string, int> mNames;
-    std::vector<Binding> mBindings;
-    std::vector<void*> mDevicePointers;
-    bool mUseManaged{false};
-};
-
-template <typename T>
-struct TrtDestroyer
-{
-    void operator()(T* t)
-    {
-        //t->destroy();
-        delete t;
-    }
-};
-
-template <typename T>
-using TrtUniquePtr = std::unique_ptr<T, TrtDestroyer<T>>;
-
-inline bool broadcastIOFormats(const std::vector<IOFormat>& formats, size_t nbBindings, bool isInput = true)
-{
-    bool broadcast = formats.size() == 1;
-    bool validFormatsCount = broadcast || (formats.size() == nbBindings);
-    if (!formats.empty() && !validFormatsCount)
-    {
-        if (isInput)
-        {
-            throw std::invalid_argument(
-                "The number of inputIOFormats must match network's inputs or be one for broadcasting.");
-        }
-        else
-        {
-            throw std::invalid_argument(
-                "The number of outputIOFormats must match network's outputs or be one for broadcasting.");
-        }
-    }
-    return broadcast;
-}
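The timing-cache helpers below are normally paired with the TensorRT 8 builder-config API (createTimingCache/setTimingCache). A hedged sketch of the round trip — config is assumed to be an existing nvinfer1::IBuilderConfig*, and the cache path is hypothetical:

    std::vector<char> const blob = sample::loadTimingCacheFile("model.timing.cache");
    std::unique_ptr<nvinfer1::ITimingCache> cache(
        config->createTimingCache(blob.data(), blob.size())); // empty blob => fresh cache
    config->setTimingCache(*cache, /*ignoreMismatch=*/false);
    // ... build the engine, then persist the possibly-updated cache:
    std::unique_ptr<nvinfer1::IHostMemory> serialized(cache->serialize());
    sample::saveTimingCacheFile("model.timing.cache", serialized.get());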
-
-inline std::vector<char> loadTimingCacheFile(const std::string inFileName)
-{
-    std::ifstream iFile(inFileName, std::ios::in | std::ios::binary);
-    if (!iFile)
-    {
-        sample::gLogWarning << "Could not read timing cache from: " << inFileName
-                            << ". A new timing cache will be generated and written." << std::endl;
-        return std::vector<char>();
-    }
-    iFile.seekg(0, std::ifstream::end);
-    size_t fsize = iFile.tellg();
-    iFile.seekg(0, std::ifstream::beg);
-    std::vector<char> content(fsize);
-    iFile.read(content.data(), fsize);
-    iFile.close();
-    sample::gLogInfo << "Loaded " << fsize << " bytes of timing cache from " << inFileName << std::endl;
-    return content;
-}
-
-inline void saveTimingCacheFile(const std::string outFileName, const nvinfer1::IHostMemory* blob)
-{
-    std::ofstream oFile(outFileName, std::ios::out | std::ios::binary);
-    if (!oFile)
-    {
-        sample::gLogWarning << "Could not write timing cache to: " << outFileName << std::endl;
-        return;
-    }
-    oFile.write((char*) blob->data(), blob->size());
-    oFile.close();
-    sample::gLogInfo << "Saved " << blob->size() << " bytes of timing cache to " << outFileName << std::endl;
-}
-
-inline int32_t getCudaDriverVersion()
-{
-    int32_t version{-1};
-    cudaCheck(cudaDriverGetVersion(&version));
-    return version;
-}
-
-inline int32_t getCudaRuntimeVersion()
-{
-    int32_t version{-1};
-    cudaCheck(cudaRuntimeGetVersion(&version));
-    return version;
-}
-
-} // namespace sample
-
-#endif // TRT_SAMPLE_UTILS_H
diff --git a/src/Detector/tensorrt_yolo/ds_image.cpp b/src/Detector/tensorrt_yolo/ds_image.cpp
index b801b874..77404f97 100644
--- a/src/Detector/tensorrt_yolo/ds_image.cpp
+++ b/src/Detector/tensorrt_yolo/ds_image.cpp
@@ -50,7 +50,8 @@ DsImage::DsImage(const cv::Mat& mat_image_, tensor_rt::ModelType net_type, const
     if (tensor_rt::ModelType::YOLOV5 == net_type || tensor_rt::ModelType::YOLOV6 == net_type ||
         tensor_rt::ModelType::YOLOV7 == net_type || tensor_rt::ModelType::YOLOV7Mask == net_type ||
         tensor_rt::ModelType::YOLOV8 == net_type || tensor_rt::ModelType::YOLOV8_OBB == net_type || tensor_rt::ModelType::YOLOV8Mask == net_type ||
-        tensor_rt::ModelType::YOLOV9 == net_type || tensor_rt::ModelType::YOLOV10 == net_type)
+        tensor_rt::ModelType::YOLOV9 == net_type || tensor_rt::ModelType::YOLOV10 == net_type ||
+        tensor_rt::ModelType::YOLOV11 == net_type || tensor_rt::ModelType::YOLOV11_OBB == net_type || tensor_rt::ModelType::YOLOV11Mask == net_type)
     {
         // resize the DsImage with scale
         float r = std::min(static_cast<float>(inputH) / static_cast<float>(m_Height), static_cast<float>(inputW) / static_cast<float>(m_Width));
@@ -101,7 +102,8 @@ DsImage::DsImage(const std::string& path, tensor_rt::ModelType net_type, const i
     if (tensor_rt::ModelType::YOLOV5 == net_type || tensor_rt::ModelType::YOLOV6 == net_type ||
         tensor_rt::ModelType::YOLOV7 == net_type || tensor_rt::ModelType::YOLOV7Mask == net_type ||
         tensor_rt::ModelType::YOLOV8 == net_type || tensor_rt::ModelType::YOLOV8_OBB == net_type || tensor_rt::ModelType::YOLOV8Mask == net_type ||
-        tensor_rt::ModelType::YOLOV9 == net_type || tensor_rt::ModelType::YOLOV10 == net_type)
+        tensor_rt::ModelType::YOLOV9 == net_type || tensor_rt::ModelType::YOLOV10 == net_type ||
+        tensor_rt::ModelType::YOLOV11 == net_type || tensor_rt::ModelType::YOLOV11_OBB == net_type || tensor_rt::ModelType::YOLOV11Mask == net_type)
     {
         // resize the DsImage with scale
         float dim = std::max(m_Height, m_Width);

From 87682db7a68ae76a6f1e4443670b7ff17fe03a82 Mon Sep 17 00:00:00 2001
From: Nuzhny007
Date: Thu, 3 Oct 2024 00:11:34 +0300
Subject: [PATCH 3/3] Fix output layer index

---
 src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

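Context for the patch below: the exported YOLOv11 instance-segmentation model has two outputs, detections (1x116x8400) and mask prototypes (1x32x160x160), and the engine may enumerate them in either order; the fix flips which index is treated as which. Selecting by shape rather than position would be order-independent; a sketch, where outputDims is a hypothetical accessor for the output tensor dimensions:

    // pick indices by rank instead of position (outputDims is an assumed accessor)
    size_t outInd = 0;
    size_t segInd = 1;
    if (outputDims[outInd].nbDims == 4) // 1x32x160x160 prototype masks came first
        std::swap(outInd, segInd);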
diff --git a/src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp b/src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp
index ea6ea2a2..54fc6b01 100644
--- a/src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp
+++ b/src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp
@@ -20,8 +20,8 @@ class YOLOv11_instance_onnx : public YoloONNX
         const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
         const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
 
-        size_t outInd = (outputs.size() == 0) ? 0 : 1;
-        size_t segInd = (outputs.size() == 0) ? 1 : 0;
+        size_t outInd = (outputs.size() == 0) ? 1 : 0;
+        size_t segInd = (outputs.size() == 0) ? 0 : 1;
 
         auto output = outputs[0];
 
@@ -39,8 +39,8 @@ class YOLOv11_instance_onnx : public YoloONNX
         //std::cout << ";" << std::endl;
 
         //0: name: images, size: 1x3x640x640
-        //1: name: output1, size: 1x32x160x160
-        //2: name: output0, size: 1x116x8400
+        //1: name: output0, size: 1x116x8400
+        //2: name: output1, size: 1x32x160x160
         // 25200 = 3x80x80 + 3x40x40 + 3x20x20
         // 116 = x, y, w, h, 80 classes, 32 seg ancors
         // 80 * 8 = 640, 40 * 16 = 640, 20 * 32 = 640
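Given the layout documented in the comments above (rows 0-3 box coordinates, rows 4-83 class scores, rows 84-115 mask coefficients, 8400 candidate columns), extracting the best class score of one candidate reduces to a strided scan. A sketch under those layout assumptions (out points at the raw 116x8400 float block):

    #include <algorithm>

    // best class score of candidate `det` in a row-major [116][8400] block
    inline float bestClassScore(float const* out, int det, int numClasses = 80, int stride = 8400)
    {
        float best = 0.F;
        for (int c = 0; c < numClasses; ++c)
            best = std::max(best, out[(4 + c) * stride + det]); // rows 4..83 hold class scores
        return best;
    }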