diff --git a/README.md b/README.md index a801234c..ba2cdfb0 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,10 @@ # Last changes +* TensorRT 10 is supported + +* YOLOv11, YOLOv11-obb and YOLOv11-seg detector worked with TensorRT! Export pretrained Pytorch models [here (ultralytics/ultralytics)](https://github.com/ultralytics/ultralytics) to onnx format and run Multitarget-tracker with -e=6 example + * YOLOv8-obb detector worked with TensorRT! Export pretrained Pytorch models [here (ultralytics/ultralytics)](https://github.com/ultralytics/ultralytics) to onnx format and run Multitarget-tracker with -e=6 example * YOLOv10 detector worked with TensorRT! Export pretrained Pytorch models [here (THU-MIG/yolov10)](https://github.com/THU-MIG/yolov10) to onnx format and run Multitarget-tracker with -e=6 example diff --git a/data/settings_yolov11.ini b/data/settings_yolov11.ini new file mode 100644 index 00000000..c82412cd --- /dev/null +++ b/data/settings_yolov11.ini @@ -0,0 +1,142 @@ +[detection] + +#----------------------------- +# opencv_dnn = 12 +# darknet_cudnn = 10 +# tensorrt = 11 +detector_backend = 12 + +#----------------------------- +# Target and backend for opencv_dnn detector +# DNN_TARGET_CPU +# DNN_TARGET_OPENCL +# DNN_TARGET_OPENCL_FP16 +# DNN_TARGET_MYRIAD +# DNN_TARGET_CUDA +# DNN_TARGET_CUDA_FP16 +ocv_dnn_target = DNN_TARGET_CPU + +# DNN_BACKEND_DEFAULT +# DNN_BACKEND_HALIDE +# DNN_BACKEND_INFERENCE_ENGINE +# DNN_BACKEND_OPENCV +# DNN_BACKEND_VKCOM +# DNN_BACKEND_CUDA +# DNN_BACKEND_INFERENCE_ENGINE_NGRAPH +# DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 +ocv_dnn_backend = DNN_BACKEND_OPENCV + +#----------------------------- +nn_weights = C:/work/home/mtracker/Multitarget-tracker/data/coco/yolo11s.onnx +nn_config = C:/work/home/mtracker/Multitarget-tracker/data/coco/yolo11s.onnx +class_names = C:/work/home/mtracker/Multitarget-tracker/data/coco/coco.names + +#----------------------------- +confidence_threshold = 0.3 + +max_crop_ratio = 0 +max_batch = 1 +gpu_id = 0 + +#----------------------------- +# YOLOV3 +# YOLOV4 +# YOLOV5 +net_type = YOLOV11 + +#----------------------------- +# INT8 +# FP16 +# FP32 +inference_precision = FP16 + + +[tracking] + +#----------------------------- +# DistCenters = 0 // Euclidean distance between centers, pixels +# DistRects = 1 // Euclidean distance between bounding rectangles, pixels +# DistJaccard = 2 // Intersection over Union, IoU, [0, 1] +# DistHist = 3 // Bhatacharia distance between histograms, [0, 1] + +distance_type = 0 + +#----------------------------- +# KalmanLinear = 0 +# KalmanUnscented = 1 + +kalman_type = 0 + +#----------------------------- +# FilterCenter = 0 +# FilterRect = 1 +# FilterRRect = 2 + +filter_goal = 0 + +#----------------------------- +# TrackNone = 0 +# TrackKCF = 1 +# TrackMIL = 2 +# TrackMedianFlow = 3 +# TrackGOTURN = 4 +# TrackMOSSE = 5 +# TrackCSRT = 6 +# TrackDAT = 7 +# TrackSTAPLE = 8 +# TrackLDES = 9 +# TrackDaSiamRPN = 10 +# Used if filter_goal == FilterRect + +lost_track_type = 0 + +#----------------------------- +# MatchHungrian = 0 +# MatchBipart = 1 + +match_type = 0 + +#----------------------------- +# Use constant acceleration motion model: +# 0 - unused (stable) +# 1 - use acceleration in Kalman filter (experimental) +use_aceleration = 0 + +#----------------------------- +# Delta time for Kalman filter +delta_time = 0.4 + +#----------------------------- +# Accel noise magnitude for Kalman filter +accel_noise = 0.2 + +#----------------------------- +# Distance threshold between region and object on 
two frames +dist_thresh = 0.8 + +#----------------------------- +# If this value > 0 than will be used circle with this radius +# If this value <= 0 than will be used ellipse with size (3*vx, 3*vy), vx and vy - horizontal and vertical speed in pixelsa +min_area_radius_pix = -1 + +#----------------------------- +# Minimal area radius in ration for object size. Used if min_area_radius_pix < 0 +min_area_radius_k = 0.8 + +#----------------------------- +# If the object do not assignment more than this frames then it will be removed +max_skip_frames = 50 + +#----------------------------- +# The maximum trajectory length +max_trace_len = 50 + +#----------------------------- +# Detection abandoned objects +detect_abandoned = 0 +# After this time (in seconds) the object is considered abandoned +min_static_time = 5 +# After this time (in seconds) the abandoned object will be removed +max_static_time = 25 +# Speed in pixels. If speed of object is more that this value than object is non static +max_speed_for_static = 10 diff --git a/data/settings_yolov11_obb.ini b/data/settings_yolov11_obb.ini new file mode 100644 index 00000000..599e5dd5 --- /dev/null +++ b/data/settings_yolov11_obb.ini @@ -0,0 +1,142 @@ +[detection] + +#----------------------------- +# opencv_dnn = 12 +# darknet_cudnn = 10 +# tensorrt = 11 +detector_backend = 12 + +#----------------------------- +# Target and backend for opencv_dnn detector +# DNN_TARGET_CPU +# DNN_TARGET_OPENCL +# DNN_TARGET_OPENCL_FP16 +# DNN_TARGET_MYRIAD +# DNN_TARGET_CUDA +# DNN_TARGET_CUDA_FP16 +ocv_dnn_target = DNN_TARGET_CPU + +# DNN_BACKEND_DEFAULT +# DNN_BACKEND_HALIDE +# DNN_BACKEND_INFERENCE_ENGINE +# DNN_BACKEND_OPENCV +# DNN_BACKEND_VKCOM +# DNN_BACKEND_CUDA +# DNN_BACKEND_INFERENCE_ENGINE_NGRAPH +# DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 +ocv_dnn_backend = DNN_BACKEND_OPENCV + +#----------------------------- +nn_weights = C:/work/home/mtracker/Multitarget-tracker/data/coco/yolo11s-obb.onnx +nn_config = C:/work/home/mtracker/Multitarget-tracker/data/coco/yolo11s-obb.onnx +class_names = C:/work/home/mtracker/Multitarget-tracker/data/DOTA.names + +#----------------------------- +confidence_threshold = 0.3 + +max_crop_ratio = 0 +max_batch = 1 +gpu_id = 0 + +#----------------------------- +# YOLOV3 +# YOLOV4 +# YOLOV5 +net_type = YOLOV11_OBB + +#----------------------------- +# INT8 +# FP16 +# FP32 +inference_precision = FP16 + + +[tracking] + +#----------------------------- +# DistCenters = 0 // Euclidean distance between centers, pixels +# DistRects = 1 // Euclidean distance between bounding rectangles, pixels +# DistJaccard = 2 // Intersection over Union, IoU, [0, 1] +# DistHist = 3 // Bhatacharia distance between histograms, [0, 1] + +distance_type = 0 + +#----------------------------- +# KalmanLinear = 0 +# KalmanUnscented = 1 + +kalman_type = 0 + +#----------------------------- +# FilterCenter = 0 +# FilterRect = 1 +# FilterRRect = 2 + +filter_goal = 0 + +#----------------------------- +# TrackNone = 0 +# TrackKCF = 1 +# TrackMIL = 2 +# TrackMedianFlow = 3 +# TrackGOTURN = 4 +# TrackMOSSE = 5 +# TrackCSRT = 6 +# TrackDAT = 7 +# TrackSTAPLE = 8 +# TrackLDES = 9 +# TrackDaSiamRPN = 10 +# Used if filter_goal == FilterRect + +lost_track_type = 0 + +#----------------------------- +# MatchHungrian = 0 +# MatchBipart = 1 + +match_type = 0 + +#----------------------------- +# Use constant acceleration motion model: +# 0 - unused (stable) +# 1 - use acceleration in Kalman filter (experimental) +use_aceleration = 0 + +#----------------------------- +# 
Delta time for Kalman filter +delta_time = 0.4 + +#----------------------------- +# Accel noise magnitude for Kalman filter +accel_noise = 0.2 + +#----------------------------- +# Distance threshold between region and object on two frames +dist_thresh = 0.8 + +#----------------------------- +# If this value > 0 than will be used circle with this radius +# If this value <= 0 than will be used ellipse with size (3*vx, 3*vy), vx and vy - horizontal and vertical speed in pixelsa +min_area_radius_pix = -1 + +#----------------------------- +# Minimal area radius in ration for object size. Used if min_area_radius_pix < 0 +min_area_radius_k = 0.8 + +#----------------------------- +# If the object do not assignment more than this frames then it will be removed +max_skip_frames = 50 + +#----------------------------- +# The maximum trajectory length +max_trace_len = 50 + +#----------------------------- +# Detection abandoned objects +detect_abandoned = 0 +# After this time (in seconds) the object is considered abandoned +min_static_time = 5 +# After this time (in seconds) the abandoned object will be removed +max_static_time = 25 +# Speed in pixels. If speed of object is more that this value than object is non static +max_speed_for_static = 10 diff --git a/data/settings_yolov11_seg.ini b/data/settings_yolov11_seg.ini new file mode 100644 index 00000000..cb5c83ea --- /dev/null +++ b/data/settings_yolov11_seg.ini @@ -0,0 +1,142 @@ +[detection] + +#----------------------------- +# opencv_dnn = 12 +# darknet_cudnn = 10 +# tensorrt = 11 +detector_backend = 12 + +#----------------------------- +# Target and backend for opencv_dnn detector +# DNN_TARGET_CPU +# DNN_TARGET_OPENCL +# DNN_TARGET_OPENCL_FP16 +# DNN_TARGET_MYRIAD +# DNN_TARGET_CUDA +# DNN_TARGET_CUDA_FP16 +ocv_dnn_target = DNN_TARGET_CPU + +# DNN_BACKEND_DEFAULT +# DNN_BACKEND_HALIDE +# DNN_BACKEND_INFERENCE_ENGINE +# DNN_BACKEND_OPENCV +# DNN_BACKEND_VKCOM +# DNN_BACKEND_CUDA +# DNN_BACKEND_INFERENCE_ENGINE_NGRAPH +# DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 +ocv_dnn_backend = DNN_BACKEND_OPENCV + +#----------------------------- +nn_weights = C:/work/home/mtracker/Multitarget-tracker/data/coco/yolo11s-seg.onnx +nn_config = C:/work/home/mtracker/Multitarget-tracker/data/coco/yolo11s-seg.onnx +class_names = C:/work/home/mtracker/Multitarget-tracker/data/coco/coco.names + +#----------------------------- +confidence_threshold = 0.3 + +max_crop_ratio = 0 +max_batch = 1 +gpu_id = 0 + +#----------------------------- +# YOLOV3 +# YOLOV4 +# YOLOV5 +net_type = YOLOV11Mask + +#----------------------------- +# INT8 +# FP16 +# FP32 +inference_precision = FP16 + + +[tracking] + +#----------------------------- +# DistCenters = 0 // Euclidean distance between centers, pixels +# DistRects = 1 // Euclidean distance between bounding rectangles, pixels +# DistJaccard = 2 // Intersection over Union, IoU, [0, 1] +# DistHist = 3 // Bhatacharia distance between histograms, [0, 1] + +distance_type = 0 + +#----------------------------- +# KalmanLinear = 0 +# KalmanUnscented = 1 + +kalman_type = 0 + +#----------------------------- +# FilterCenter = 0 +# FilterRect = 1 +# FilterRRect = 2 + +filter_goal = 0 + +#----------------------------- +# TrackNone = 0 +# TrackKCF = 1 +# TrackMIL = 2 +# TrackMedianFlow = 3 +# TrackGOTURN = 4 +# TrackMOSSE = 5 +# TrackCSRT = 6 +# TrackDAT = 7 +# TrackSTAPLE = 8 +# TrackLDES = 9 +# TrackDaSiamRPN = 10 +# Used if filter_goal == FilterRect + +lost_track_type = 0 + +#----------------------------- +# MatchHungrian = 0 +# MatchBipart = 1 + 
+match_type = 0 + +#----------------------------- +# Use constant acceleration motion model: +# 0 - unused (stable) +# 1 - use acceleration in Kalman filter (experimental) +use_aceleration = 0 + +#----------------------------- +# Delta time for Kalman filter +delta_time = 0.4 + +#----------------------------- +# Accel noise magnitude for Kalman filter +accel_noise = 0.2 + +#----------------------------- +# Distance threshold between region and object on two frames +dist_thresh = 0.8 + +#----------------------------- +# If this value > 0 than will be used circle with this radius +# If this value <= 0 than will be used ellipse with size (3*vx, 3*vy), vx and vy - horizontal and vertical speed in pixelsa +min_area_radius_pix = -1 + +#----------------------------- +# Minimal area radius in ration for object size. Used if min_area_radius_pix < 0 +min_area_radius_k = 0.8 + +#----------------------------- +# If the object do not assignment more than this frames then it will be removed +max_skip_frames = 50 + +#----------------------------- +# The maximum trajectory length +max_trace_len = 50 + +#----------------------------- +# Detection abandoned objects +detect_abandoned = 0 +# After this time (in seconds) the object is considered abandoned +min_static_time = 5 +# After this time (in seconds) the abandoned object will be removed +max_static_time = 25 +# Speed in pixels. If speed of object is more that this value than object is non static +max_speed_for_static = 10 diff --git a/example/examples.h b/example/examples.h index 1be76399..08b0fc67 100644 --- a/example/examples.h +++ b/example/examples.h @@ -652,7 +652,10 @@ class YoloTensorRTExample final : public VideoExample YOLOV8_OBB, YOLOv8Mask, YOLOv9, - YOLOv10 + YOLOv10, + YOLOv11, + YOLOv11_OBB, + YOLOv11Mask }; YOLOModels usedModel = YOLOModels::YOLOv9; switch (usedModel) diff --git a/src/Detector/OCVDNNDetector.cpp b/src/Detector/OCVDNNDetector.cpp index 01d1102f..3da65967 100644 --- a/src/Detector/OCVDNNDetector.cpp +++ b/src/Detector/OCVDNNDetector.cpp @@ -142,6 +142,9 @@ bool OCVDNNDetector::Init(const config_t& config) dictNetType["YOLOV8Mask"] = ModelType::YOLOV8Mask; dictNetType["YOLOV9"] = ModelType::YOLOV9; dictNetType["YOLOV10"] = ModelType::YOLOV10; + dictNetType["YOLOV11"] = ModelType::YOLOV11; + dictNetType["YOLOV11_OBB"] = ModelType::YOLOV11_OBB; + dictNetType["YOLOV11Mask"] = ModelType::YOLOV11Mask; auto netType = dictNetType.find(net_type->second); if (netType != dictNetType.end()) @@ -348,7 +351,7 @@ void OCVDNNDetector::DetectInCrop(const cv::UMat& colorFrame, const cv::Rect& cr } else { - if (m_netType == ModelType::YOLOV8 || m_netType == ModelType::YOLOV5 || m_netType == ModelType::YOLOV9 || m_netType == ModelType::YOLOV10) + if (m_netType == ModelType::YOLOV8 || m_netType == ModelType::YOLOV5 || m_netType == ModelType::YOLOV9 || m_netType == ModelType::YOLOV10 || m_netType == ModelType::YOLOV11) { int rows = detections[0].size[1]; int dimensions = detections[0].size[2]; @@ -370,7 +373,7 @@ void OCVDNNDetector::DetectInCrop(const cv::UMat& colorFrame, const cv::Rect& cr for (int i = 0; i < rows; ++i) { - if (m_netType == ModelType::YOLOV8 || m_netType == ModelType::YOLOV9) + if (m_netType == ModelType::YOLOV8 || m_netType == ModelType::YOLOV9 || m_netType == ModelType::YOLOV11) { float* classes_scores = data + 4; diff --git a/src/Detector/OCVDNNDetector.h b/src/Detector/OCVDNNDetector.h index 79842ba2..44d91b4d 100644 --- a/src/Detector/OCVDNNDetector.h +++ b/src/Detector/OCVDNNDetector.h @@ -42,7 +42,10 @@ class 
OCVDNNDetector final : public BaseDetector YOLOV8_OBB, YOLOV8Mask, YOLOV9, - YOLOV10 + YOLOV10, + YOLOV11, + YOLOV11_OBB, + YOLOV11Mask }; cv::dnn::Net m_net; diff --git a/src/Detector/YoloTensorRTDetector.cpp b/src/Detector/YoloTensorRTDetector.cpp index a0ebeb44..d1cfb352 100644 --- a/src/Detector/YoloTensorRTDetector.cpp +++ b/src/Detector/YoloTensorRTDetector.cpp @@ -107,6 +107,9 @@ bool YoloTensorRTDetector::Init(const config_t& config) dictNetType["YOLOV8Mask"] = tensor_rt::YOLOV8Mask; dictNetType["YOLOV9"] = tensor_rt::YOLOV9; dictNetType["YOLOV10"] = tensor_rt::YOLOV10; + dictNetType["YOLOV11"] = tensor_rt::YOLOV11; + dictNetType["YOLOV11_OBB"] = tensor_rt::YOLOV11_OBB; + dictNetType["YOLOV11Mask"] = tensor_rt::YOLOV11Mask; auto netType = dictNetType.find(net_type->second); if (netType != dictNetType.end()) @@ -298,7 +301,7 @@ void YoloTensorRTDetector::Detect(const std::vector& frames, std::vect /// void YoloTensorRTDetector::CalcMotionMap(cv::Mat& frame) { - if (m_localConfig.net_type == tensor_rt::YOLOV7Mask || m_localConfig.net_type == tensor_rt::YOLOV8Mask) + if (m_localConfig.net_type == tensor_rt::YOLOV7Mask || m_localConfig.net_type == tensor_rt::YOLOV8Mask || m_localConfig.net_type == tensor_rt::YOLOV11Mask) { static std::vector color; if (color.empty()) diff --git a/src/Detector/tensorrt_yolo/CMakeLists.txt b/src/Detector/tensorrt_yolo/CMakeLists.txt index 30509d0e..d09a2243 100644 --- a/src/Detector/tensorrt_yolo/CMakeLists.txt +++ b/src/Detector/tensorrt_yolo/CMakeLists.txt @@ -43,7 +43,7 @@ SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake) find_package(CUDNN REQUIRED) find_package(TensorRT REQUIRED) -message("TensorRT major version: " ${TensorRT_VERSION_MAJOR}) +message("TensorRT version: " ${TensorRT_VERSION}) include_directories(${OpenCV_INCLUDE_DIRS}) include_directories(${CUDA_INCLUDE_DIRS}) @@ -58,13 +58,17 @@ file(GLOB TENSORRT_CUDA_FILES *.cu) cuda_add_library(${libname_rt} SHARED ${TENSORRT_CUDA_FILES} ${TENSORRT_SOURCE_FILES} - ${TENSORRT_HEADER_FILES} -) + ${TENSORRT_HEADER_FILES}) #message("TensorRT OpenCV libraries:") #message("${OpenCV_LIBS}") #message(${OpenCV_DIR}) +set(TensorRT_LIBRARIES ${TensorRT_LIBRARIES} ${TensorRT_nvinfer_LIBRARY} ${TensorRT_nvinfer_plugin_LIBRARY} ${TensorRT_nvonnxparser_LIBRARY}) + +message("TensorRT_LIBRARIES: ${TensorRT_LIBRARIES}") + + set(TENSORRT_LIBS ${OpenCV_LIBS} #${CUDA_LIBRARIES} @@ -74,13 +78,14 @@ set(TENSORRT_LIBS ${CUDA_curand_LIBRARY} ${CUDNN_LIBRARY} # ${LIB_PTHREAD} - ${TensorRT_LIBRARIES} -) + ${TensorRT_LIBRARIES}) if (CMAKE_COMPILER_IS_GNUCXX) - set(TENSORRT_LIBS ${TENSORRT_LIBS} stdc++fs nvinfer_plugin nvonnxparser) + set(TENSORRT_LIBS ${TENSORRT_LIBS} stdc++fs) endif(CMAKE_COMPILER_IS_GNUCXX) +message("TENSORRT_LIBS: ${TENSORRT_LIBS}") + target_link_libraries(${libname_rt} ${TENSORRT_LIBS}) install(TARGETS ${libname_rt} @@ -90,4 +95,4 @@ install(TARGETS ${libname_rt} RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_PREFIX}/include/${PROJECT_NAME}) -set_target_properties(${PROJECT_NAME} PROPERTIES FOLDER "libs") \ No newline at end of file +set_target_properties(${PROJECT_NAME} PROPERTIES FOLDER "libs") diff --git a/src/Detector/tensorrt_yolo/YoloONNX.cpp b/src/Detector/tensorrt_yolo/YoloONNX.cpp index b016c4b3..3ea99ec4 100644 --- a/src/Detector/tensorrt_yolo/YoloONNX.cpp +++ b/src/Detector/tensorrt_yolo/YoloONNX.cpp @@ -1,5 +1,7 @@ #include +#define DEFINE_TRT_ENTRYPOINTS 1 + #include "YoloONNX.hpp" #include "trt_utils.h" #include 
"../../common/defines.h" @@ -22,14 +24,13 @@ bool YoloONNX::Init(const SampleYoloParams& params) auto GetBindings = [&]() { - auto numBindings = m_engine->getNbBindings(); + auto numBindings = m_engine->getNbIOTensors(); std::cout << "** Bindings: " << numBindings << " **" << std::endl; for (int32_t i = 0; i < numBindings; ++i) { - nvinfer1::Dims dim = m_engine->getBindingDimensions(i); - - std::string bindName = m_engine->getBindingName(i); + std::string bindName = m_engine->getIOTensorName(i); + nvinfer1::Dims dim = m_engine->getTensorShape(bindName.c_str()); for (const auto& outName : m_params.outputTensorNames) { if (bindName == outName) @@ -77,27 +78,17 @@ bool YoloONNX::Init(const SampleYoloParams& params) delete infer; #endif - sample::gLogInfo << "TRT Engine loaded from: " << m_params.engineFileName << std::endl; - - GetBindings(); - - if (!m_engine) + if (m_engine) { - res = false; + GetBindings(); + m_inputDims = m_engine->getTensorShape(m_engine->getIOTensorName(0)); + res = true; } else { -#if 1 - m_inputDims = m_engine->getBindingDimensions(0); -#else - m_inputDims.nbDims = 4; - m_inputDims.d[0] = m_params.explicitBatchSize; - m_inputDims.d[1] = 3; - m_inputDims.d[2] = m_params.width; - m_inputDims.d[3] = m_params.height; -#endif res = true; } + sample::gLogInfo << "TRT Engine loaded from: " << m_params.engineFileName << " with res = " << res << std::endl; } else { @@ -175,9 +166,9 @@ bool YoloONNX::ConstructNetwork(YoloONNXUniquePtr& builder, size_t dlaManagedSRAMSize = config->getMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_MANAGED_SRAM); size_t dlaLocalDRAMSize = config->getMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_LOCAL_DRAM); size_t dlaGlobalDRAMSize = config->getMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_GLOBAL_DRAM); - std::cout << "workspaceSize = " << workspaceSize << ", dlaManagedSRAMSize = " << dlaManagedSRAMSize << ", dlaLocalDRAMSize = " << dlaLocalDRAMSize << ", dlaGlobalDRAMSize = " << dlaGlobalDRAMSize << std::endl; + std::cout << "m_params.videoMemory = " << m_params.videoMemory << ", workspaceSize = " << workspaceSize << ", dlaManagedSRAMSize = " << dlaManagedSRAMSize << ", dlaLocalDRAMSize = " << dlaLocalDRAMSize << ", dlaGlobalDRAMSize = " << dlaGlobalDRAMSize << std::endl; - config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, m_params.videoMemory ? m_params.videoMemory : 4096_MiB); + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, m_params.videoMemory ? 
m_params.videoMemory : workspaceSize); #endif config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); diff --git a/src/Detector/tensorrt_yolo/YoloONNXv11_bb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv11_bb.hpp new file mode 100644 index 00000000..9103bfa6 --- /dev/null +++ b/src/Detector/tensorrt_yolo/YoloONNXv11_bb.hpp @@ -0,0 +1,111 @@ +#pragma once + +#include "YoloONNX.hpp" + +/// +/// \brief The YOLOv11_bb_onnx class +/// +class YOLOv11_bb_onnx : public YoloONNX +{ +protected: + /// + /// \brief GetResult + /// \param output + /// \return + /// + std::vector GetResult(size_t imgIdx, int /*keep_topk*/, const std::vector& outputs, cv::Size frameSize) + { + std::vector resBoxes; + + //0: name: images, size: 1x3x640x640 + //1: name: output0, size: 1x84x8400 + + const float fw = static_cast(frameSize.width) / static_cast(m_inputDims.d[3]); + const float fh = static_cast(frameSize.height) / static_cast(m_inputDims.d[2]); + + auto output = outputs[0]; + + size_t ncInd = 1; + size_t lenInd = 2; + int nc = m_outpuDims[0].d[ncInd] - 4; + int dimensions = nc + 4; + size_t len = static_cast(m_outpuDims[0].d[lenInd]) / m_params.explicitBatchSize; + //auto Volume = [](const nvinfer1::Dims& d) + //{ + // return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); + //}; + auto volume = len * m_outpuDims[0].d[ncInd]; // Volume(m_outpuDims[0]); + output += volume * imgIdx; + //std::cout << "len = " << len << ", nc = " << nc << ", m_params.confThreshold = " << m_params.confThreshold << ", volume = " << volume << std::endl; + + cv::Mat rawMemory(1, dimensions * static_cast(len), CV_32FC1, output); + rawMemory = rawMemory.reshape(1, dimensions); + cv::transpose(rawMemory, rawMemory); + output = (float*)rawMemory.data; + + //std::cout << "output[0] mem:\n"; + //for (size_t ii = 0; ii < 100; ++ii) + //{ + // std::cout << ii << ": "; + // for (size_t jj = 0; jj < 20; ++jj) + // { + // std::cout << output[ii * 20 + jj] << " "; + // } + // std::cout << ";" << std::endl; + //} + //std::cout << ";" << std::endl; + + std::vector classIds; + std::vector confidences; + std::vector rectBoxes; + classIds.reserve(len); + confidences.reserve(len); + rectBoxes.reserve(len); + + for (size_t i = 0; i < len; ++i) + { + // Box + size_t k = i * (nc + 4); + + int classId = -1; + float objectConf = 0.f; + for (int j = 0; j < nc; ++j) + { + const float classConf = output[k + 4 + j]; + if (classConf > objectConf) + { + classId = j; + objectConf = classConf; + } + } + + //if (i == 0) + // std::cout << i << ": object_conf = " << object_conf << ", class_conf = " << class_conf << ", classId = " << classId << ", rect = " << cv::Rect(cvRound(x), cvRound(y), cvRound(width), cvRound(height)) << std::endl; + + if (objectConf >= m_params.confThreshold) + { + classIds.push_back(classId); + confidences.push_back(objectConf); + + // (center x, center y, width, height) to (x, y, w, h) + float x = fw * (output[k] - output[k + 2] / 2); + float y = fh * (output[k + 1] - output[k + 3] / 2); + float width = fw * output[k + 2]; + float height = fh * output[k + 3]; + rectBoxes.emplace_back(cvRound(x), cvRound(y), cvRound(width), cvRound(height)); + } + } + + // Non-maximum suppression to eliminate redudant overlapping boxes + std::vector indices; + cv::dnn::NMSBoxes(rectBoxes, confidences, m_params.confThreshold, m_params.nmsThreshold, indices); + resBoxes.reserve(indices.size()); + + for (size_t bi = 0; bi < indices.size(); ++bi) + { + resBoxes.emplace_back(classIds[indices[bi]], confidences[indices[bi]], rectBoxes[indices[bi]]); + } + + 
return resBoxes; + } +}; diff --git a/src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp b/src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp new file mode 100644 index 00000000..54fc6b01 --- /dev/null +++ b/src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp @@ -0,0 +1,301 @@ +#pragma once + +#include "YoloONNX.hpp" + +/// +/// \brief The YOLOv11_instance_onnx class +/// +class YOLOv11_instance_onnx : public YoloONNX +{ +protected: + /// + /// \brief GetResult + /// \param output + /// \return + /// + std::vector GetResult(size_t imgIdx, int /*keep_topk*/, const std::vector& outputs, cv::Size frameSize) + { + std::vector resBoxes; + + const float fw = static_cast(frameSize.width) / static_cast(m_inputDims.d[3]); + const float fh = static_cast(frameSize.height) / static_cast(m_inputDims.d[2]); + + size_t outInd = (outputs.size() == 0) ? 1 : 0; + size_t segInd = (outputs.size() == 0) ? 0 : 1; + + auto output = outputs[0]; + + //std::cout << "output[1] mem:\n"; + //auto output1 = outputs[1]; + //for (size_t ii = 0; ii < 100; ++ii) + //{ + // std::cout << ii << ": "; + // for (size_t jj = 0; jj < 20; ++jj) + // { + // std::cout << output1[ii * 20 + jj] << " "; + // } + // std::cout << ";" << std::endl; + //} + //std::cout << ";" << std::endl; + + //0: name: images, size: 1x3x640x640 + //1: name: output0, size: 1x116x8400 + //2: name: output1, size: 1x32x160x160 + // 25200 = 3x80x80 + 3x40x40 + 3x20x20 + // 116 = x, y, w, h, 80 classes, 32 seg ancors + // 80 * 8 = 640, 40 * 16 = 640, 20 * 32 = 640 + + size_t ncInd = 1; + size_t lenInd = 2; + int nc = m_outpuDims[outInd].d[ncInd] - 4 - 32; + int dimensions = nc + 32 + 4; + size_t len = static_cast(m_outpuDims[outInd].d[lenInd]) / m_params.explicitBatchSize; + //auto Volume = [](const nvinfer1::Dims& d) + //{ + // return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); + //}; + auto volume = len * m_outpuDims[outInd].d[ncInd]; // Volume(m_outpuDims[0]); + output += volume * imgIdx; + //std::cout << "len = " << len << ", nc = " << nc << ", m_params.confThreshold = " << m_params.confThreshold << ", volume = " << volume << std::endl; + + cv::Mat rawMemory(1, dimensions * static_cast(len), CV_32FC1, output); + rawMemory = rawMemory.reshape(1, dimensions); + cv::transpose(rawMemory, rawMemory); + output = (float*)rawMemory.data; + + //std::cout << "output[0] mem:\n"; + //for (size_t ii = 0; ii < 100; ++ii) + //{ + // std::cout << ii << ": "; + // for (size_t jj = 0; jj < 20; ++jj) + // { + // std::cout << output[ii * 20 + jj] << " "; + // } + // std::cout << ";" << std::endl; + //} + //std::cout << ";" << std::endl; + +#if 1 + int segWidth = 160; + int segHeight = 160; + int segChannels = 32; + + if (outputs.size() > 1) + { + //std::cout << "output1 nbDims: " << m_outpuDims[segInd].nbDims << ", "; + //for (size_t i = 0; i < m_outpuDims[segInd].nbDims; ++i) + //{ + // std::cout << m_outpuDims[segInd].d[i]; + // if (i + 1 != m_outpuDims[segInd].nbDims) + // std::cout << "x"; + //} + //std::cout << std::endl; + //std::cout << "output nbDims: " << m_outpuDims[outInd].nbDims << ", "; + //for (size_t i = 0; i < m_outpuDims[outInd].nbDims; ++i) + //{ + // std::cout << m_outpuDims[outInd].d[i]; + // if (i + 1 != m_outpuDims[outInd].nbDims) + // std::cout << "x"; + //} + //std::cout << std::endl; + + segChannels = m_outpuDims[segInd].d[1]; + segWidth = m_outpuDims[segInd].d[2]; + segHeight = m_outpuDims[segInd].d[3]; + } + cv::Mat maskProposals; + std::vector> picked_proposals; + int net_width = nc + 4 + segChannels; +#endif + + std::vector 
classIds; + std::vector confidences; + std::vector rectBoxes; + classIds.reserve(len); + confidences.reserve(len); + rectBoxes.reserve(len); + + for (size_t i = 0; i < len; ++i) + { + // Box + size_t k = i * (nc + 4 + 32); + + int classId = -1; + float objectConf = 0.f; + for (int j = 0; j < nc; ++j) + { + const float classConf = output[k + 4 + j]; + if (classConf > objectConf) + { + classId = j; + objectConf = classConf; + } + } + + //if (i == 0) + //{ + // std::cout << "without nms: mem" << i << ": "; + // for (size_t ii = 0; ii < 4; ++ii) + // { + // std::cout << output[k + ii] << " "; + // } + // std::cout << ";" << std::endl; + // for (size_t ii = 4; ii < nc + 4; ++ii) + // { + // std::cout << output[k + ii] << " "; + // } + // std::cout << ";" << std::endl; + // for (size_t ii = nc + 4; ii < nc + 4 + 32; ++ii) + // { + // std::cout << output[k + ii] << " "; + // } + // std::cout << ";" << std::endl; + //} + + if (objectConf >= m_params.confThreshold) + { + // (center x, center y, width, height) to (x, y, w, h) + float x = fw * (output[k] - output[k + 2] / 2); + float y = fh * (output[k + 1] - output[k + 3] / 2); + float width = fw * output[k + 2]; + float height = fh * output[k + 3]; + + //auto ClampToFrame = [](float& v, float& size, int hi) -> int + //{ + // int res = 0; +// + // if (size < 1) + // size = 0; +// + // if (v < 0) + // { + // res = v; + // v = 0; + // return res; + // } + // else if (v + size > hi - 1) + // { + // res = v; + // v = hi - 1 - size; + // if (v < 0) + // { + // size += v; + // v = 0; + // } + // res -= v; + // return res; + // } + // return res; + //}; + //ClampToFrame(x, width, frameSize.width); + //ClampToFrame(y, height, frameSize.height); + + //if (i == 0) + // std::cout << i << ": object_conf = " << object_conf << ", class_conf = " << class_conf << ", classId = " << classId << ", rect = " << cv::Rect(cvRound(x), cvRound(y), cvRound(width), cvRound(height)) << std::endl; + + if (width > 4 && height > 4) + { + classIds.push_back(classId); + confidences.push_back(objectConf); + rectBoxes.emplace_back(cvRound(x), cvRound(y), cvRound(width), cvRound(height)); + + std::vector temp_proto(output + k + 4 + nc, output + k + net_width); + picked_proposals.push_back(temp_proto); + } + } + } + + // Non-maximum suppression to eliminate redudant overlapping boxes + std::vector indices; + cv::dnn::NMSBoxes(rectBoxes, confidences, m_params.confThreshold, m_params.nmsThreshold, indices); + resBoxes.reserve(indices.size()); + + for (size_t bi = 0; bi < indices.size(); ++bi) + { + resBoxes.emplace_back(classIds[indices[bi]], confidences[indices[bi]], Clamp(rectBoxes[indices[bi]], frameSize)); + maskProposals.push_back(cv::Mat(picked_proposals[indices[bi]]).t()); + } + + if (!maskProposals.empty()) + { + // Mask processing + const float* pdata = outputs[1]; + std::vector maskFloat(pdata, pdata + segChannels * segWidth * segHeight); + + int INPUT_W = m_inputDims.d[3]; + int INPUT_H = m_inputDims.d[2]; + static constexpr float MASK_THRESHOLD = 0.5; + + cv::Mat mask_protos = cv::Mat(maskFloat); + cv::Mat protos = mask_protos.reshape(0, { segChannels, segWidth * segHeight }); + + cv::Mat matmulRes = (maskProposals * protos).t();//n*32 32*25600 + cv::Mat masks = matmulRes.reshape(static_cast(resBoxes.size()), { segWidth, segHeight }); + std::vector maskChannels; + split(masks, maskChannels); + for (size_t i = 0; i < resBoxes.size(); ++i) + { + cv::Mat dest; + cv::Mat mask; + //sigmoid + cv::exp(-maskChannels[i], dest); + dest = 1.0 / (1.0 + dest);//160*160 + + int padw = 0; + 
int padh = 0; + cv::Rect roi(int((float)padw / INPUT_W * segWidth), int((float)padh / INPUT_H * segHeight), int(segWidth - padw / 2), int(segHeight - padh / 2)); + dest = dest(roi); + + cv::resize(dest, mask, frameSize, cv::INTER_NEAREST); + + resBoxes[i].m_boxMask = mask(resBoxes[i].m_brect) > MASK_THRESHOLD; + +#if 0 + static int globalObjInd = 0; + SaveMat(resBoxes[i].m_boxMask, std::to_string(globalObjInd++), ".png", "tmp", true); +#endif + + std::vector> contours; + std::vector hierarchy; +#if (CV_VERSION_MAJOR < 4) + cv::findContours(resBoxes[i].m_boxMask, contours, hierarchy, CV_RETR_EXTERNAL, CV_CHAIN_APPROX_SIMPLE, cv::Point()); +#else + cv::findContours(resBoxes[i].m_boxMask, contours, hierarchy, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_SIMPLE, cv::Point()); +#endif + for (const auto& contour : contours) + { + cv::Rect br = cv::boundingRect(contour); + + if (br.width >= 4 && + br.height >= 4) + { + cv::RotatedRect rr = (contour.size() < 5) ? cv::minAreaRect(contour) : cv::fitEllipse(contour); + + br.x += resBoxes[i].m_brect.x; + br.y += resBoxes[i].m_brect.y; + rr.center.x += resBoxes[i].m_brect.x; + rr.center.y += resBoxes[i].m_brect.y; + + //std::cout << "rr: " << rr.center << ", " << rr.angle << ", " << rr.size << std::endl; + + if (resBoxes[i].m_boxMask.size() != br.size()) + { + br.width = resBoxes[i].m_boxMask.cols; + br.height = resBoxes[i].m_boxMask.rows; + if (br.x + br.width >= frameSize.width) + br.x = frameSize.width - br.width; + if (br.y + br.height >= frameSize.height) + br.y = frameSize.height - br.height; + } + + resBoxes[i].m_brect = br; + resBoxes[i].m_rrect = rr; + + break; + } + } + } + } + return resBoxes; + } +}; diff --git a/src/Detector/tensorrt_yolo/YoloONNXv11_obb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv11_obb.hpp new file mode 100644 index 00000000..7c2b98ce --- /dev/null +++ b/src/Detector/tensorrt_yolo/YoloONNXv11_obb.hpp @@ -0,0 +1,124 @@ +#pragma once + +#include "YoloONNX.hpp" + +/// +/// \brief The YOLOv11_obb_onnx class +/// +class YOLOv11_obb_onnx : public YoloONNX +{ +protected: + /// + /// \brief GetResult + /// \param output + /// \return + /// + std::vector GetResult(size_t imgIdx, int /*keep_topk*/, const std::vector& outputs, cv::Size frameSize) + { + std::vector resBoxes; + + //0: name: images, size: 1x3x1024x1024 + //1: name: output0, size: 1x20x21504 + //20: 15 DOTA classes + x + y + w + h + a + constexpr int shapeDataSize = 5; + + const float fw = static_cast(frameSize.width) / static_cast(m_inputDims.d[3]); + const float fh = static_cast(frameSize.height) / static_cast(m_inputDims.d[2]); + + auto output = outputs[0]; + + size_t ncInd = 1; + size_t lenInd = 2; + int nc = m_outpuDims[0].d[ncInd] - shapeDataSize; + int dimensions = nc + shapeDataSize; + size_t len = static_cast(m_outpuDims[0].d[lenInd]) / m_params.explicitBatchSize; + //auto Volume = [](const nvinfer1::Dims& d) + //{ + // return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); + //}; + auto volume = len * m_outpuDims[0].d[ncInd]; // Volume(m_outpuDims[0]); + output += volume * imgIdx; + //std::cout << "len = " << len << ", nc = " << nc << ", m_params.confThreshold = " << m_params.confThreshold << ", volume = " << volume << std::endl; + + cv::Mat rawMemory(1, dimensions * static_cast(len), CV_32FC1, output); + rawMemory = rawMemory.reshape(1, dimensions); + cv::transpose(rawMemory, rawMemory); + output = (float*)rawMemory.data; + + //std::cout << "output[0] mem:\n"; + //for (size_t ii = 0; ii < 100; ++ii) + //{ + // std::cout << ii << ": "; + // for (size_t jj = 
0; jj < 20; ++jj) + // { + // std::cout << output[ii * 20 + jj] << " "; + // } + // std::cout << ";" << std::endl; + //} + //std::cout << ";" << std::endl; + + std::vector classIds; + std::vector confidences; + std::vector rectBoxes; + classIds.reserve(len); + confidences.reserve(len); + rectBoxes.reserve(len); + + for (size_t i = 0; i < len; ++i) + { + // Box + size_t k = i * (nc + shapeDataSize); + + int classId = -1; + float objectConf = 0.f; + for (int j = 0; j < nc; ++j) + { + const float classConf = output[k + 4 + j]; + if (classConf > objectConf) + { + classId = j; + objectConf = classConf; + } + } + + //if (i == 0) + //{ + // for (int jj = 0; jj < 20; ++jj) + // { + // std::cout << output[jj] << " "; + // } + // std::cout << std::endl; + //} + + if (objectConf >= m_params.confThreshold) + { + classIds.push_back(classId); + confidences.push_back(objectConf); + + // (center x, center y, width, height) + float cx = fw * output[k]; + float cy = fh * output[k + 1]; + float width = fw * output[k + 2]; + float height = fh * output[k + 3]; + float angle = 180.f * output[k + nc + shapeDataSize - 1] / M_PI; + rectBoxes.emplace_back(cv::Point2f(cx, cy), cv::Size2f(width, height), angle); + + //if (rectBoxes.size() == 1) + // std::cout << i << ": object_conf = " << objectConf << ", classId = " << classId << ", rect = " << rectBoxes.back().boundingRect() << ", angle = " << angle << std::endl; + } + } + + // Non-maximum suppression to eliminate redudant overlapping boxes + //std::vector indices; + //cv::dnn::NMSBoxes(rectBoxes, confidences, m_params.confThreshold, m_params.nmsThreshold, indices); + //resBoxes.reserve(indices.size()); + + resBoxes.reserve(rectBoxes.size()); + for (size_t bi = 0; bi < rectBoxes.size(); ++bi) + { + resBoxes.emplace_back(classIds[bi], confidences[bi], rectBoxes[bi]); + } + + return resBoxes; + } +}; diff --git a/src/Detector/tensorrt_yolo/class_detector.cpp b/src/Detector/tensorrt_yolo/class_detector.cpp index f7a18e23..895e5d96 100644 --- a/src/Detector/tensorrt_yolo/class_detector.cpp +++ b/src/Detector/tensorrt_yolo/class_detector.cpp @@ -10,6 +10,10 @@ #include "YoloONNXv8_instance.hpp" #include "YoloONNXv9_bb.hpp" #include "YoloONNXv10_bb.hpp" +#include "YoloONNXv11_bb.hpp" +#include "YoloONNXv11_obb.hpp" +#include "YoloONNXv11_instance.hpp" + namespace tensor_rt { @@ -110,6 +114,22 @@ namespace tensor_rt m_params.outputTensorNames.push_back("output0"); m_detector = std::make_unique(); break; + case ModelType::YOLOV11: + m_params.inputTensorNames.push_back("images"); + m_params.outputTensorNames.push_back("output0"); + m_detector = std::make_unique(); + break; + case ModelType::YOLOV11_OBB: + m_params.inputTensorNames.push_back("images"); + m_params.outputTensorNames.push_back("output0"); + m_detector = std::make_unique(); + break; + case ModelType::YOLOV11Mask: + m_params.inputTensorNames.push_back("images"); + m_params.outputTensorNames.push_back("output0"); + m_params.outputTensorNames.push_back("output1"); + m_detector = std::make_unique(); + break; } // Threshold values @@ -193,7 +213,8 @@ namespace tensor_rt if (config.net_type == ModelType::YOLOV6 || config.net_type == ModelType::YOLOV7 || config.net_type == ModelType::YOLOV7Mask || config.net_type == ModelType::YOLOV8 || config.net_type == ModelType::YOLOV8_OBB || config.net_type == ModelType::YOLOV8Mask || - config.net_type == ModelType::YOLOV9 || config.net_type == ModelType::YOLOV10) + config.net_type == ModelType::YOLOV9 || config.net_type == ModelType::YOLOV10 || + config.net_type == 
ModelType::YOLOV11 || config.net_type == ModelType::YOLOV11_OBB || config.net_type == ModelType::YOLOV11Mask) m_impl = new YoloONNXImpl(); else m_impl = new YoloDectectorImpl(); diff --git a/src/Detector/tensorrt_yolo/class_detector.h b/src/Detector/tensorrt_yolo/class_detector.h index 1dd85d70..b4da0d0a 100644 --- a/src/Detector/tensorrt_yolo/class_detector.h +++ b/src/Detector/tensorrt_yolo/class_detector.h @@ -54,7 +54,10 @@ namespace tensor_rt YOLOV8_OBB, YOLOV8Mask, YOLOV9, - YOLOV10 + YOLOV10, + YOLOV11, + YOLOV11_OBB, + YOLOV11Mask }; /// diff --git a/src/Detector/tensorrt_yolo/cmake/FindTensorRT.cmake b/src/Detector/tensorrt_yolo/cmake/FindTensorRT.cmake index 7f07dd36..f4f9f42c 100644 --- a/src/Detector/tensorrt_yolo/cmake/FindTensorRT.cmake +++ b/src/Detector/tensorrt_yolo/cmake/FindTensorRT.cmake @@ -1,39 +1,51 @@ +# ~~~ +# Copyright 2021 Olivier Le Doeuff +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # This module defines the following variables: # -# :: +# - TensorRT_FOUND: A boolean specifying whether or not TensorRT was found. +# - TensorRT_VERSION: The exact version of TensorRT found +# - TensorRT_VERSION_MAJOR: The major version of TensorRT. +# - TensorRT_VERSION_MINOR: The minor version of TensorRT. +# - TensorRT_VERSION_PATCH: The patch version of TensorRT. +# - TensorRT_VERSION_TWEAK: The tweak version of TensorRT. +# - TensorRT_INCLUDE_DIRS: The path to TensorRT ``include`` folder containing the header files required to compile a project linking against TensorRT. +# - TensorRT_LIBRARY_DIRS: The path to TensorRT library directory that contains libraries. # -# TensorRT_INCLUDE_DIRS -# TensorRT_LIBRARIES -# TensorRT_FOUND -# -# :: -# -# TensorRT_VERSION_STRING - version (x.y.z) -# TensorRT_VERSION_MAJOR - major version (x) -# TensorRT_VERSION_MINOR - minor version (y) -# TensorRT_VERSION_PATCH - patch version (z) +# This module create following targets: +# - trt::nvinfer +# - trt::nvinfer_plugin +# - trt::nvonnxparser +# - trt::nvparsers +# This script was inspired from https://github.com/NicolasIRAGNE/CMakeScripts +# This script was inspired from https://github.com/NVIDIA/tensorrt-laboratory/blob/master/cmake/FindTensorRT.cmake # # Hints # ^^^^^ # A user may set ``TensorRT_ROOT`` to an installation root to tell this module where to look. 
-# -set(_TensorRT_SEARCHES) +# ~~~ -if(TensorRT_ROOT) - set(_TensorRT_SEARCH_ROOT PATHS ${TensorRT_ROOT} NO_DEFAULT_PATH) - list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_ROOT) +if(NOT TensorRT_FIND_COMPONENTS) + set(TensorRT_FIND_COMPONENTS nvinfer nvinfer_plugin nvonnxparser) endif() +set(TensorRT_LIBRARIES) -# appends some common paths -set(_TensorRT_SEARCH_NORMAL - PATHS "/usr" +# find the include directory of TensorRT +find_path( + TensorRT_INCLUDE_DIR + NAMES NvInfer.h + PATHS ${TensorRT_ROOT} ENV TensorRT_ROOT + PATH_SUFFIXES include ) -list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_NORMAL) -# Include dir -foreach(search ${_TensorRT_SEARCHES}) - find_path(TensorRT_INCLUDE_DIR NAMES NvInfer.h ${${search}} PATH_SUFFIXES include) -endforeach() +string(FIND ${TensorRT_INCLUDE_DIR} "NOTFOUND" _include_dir_notfound) +if(NOT _include_dir_notfound EQUAL -1) + if(TensorRT_FIND_REQUIRED) + message(FATAL_ERROR "Fail to find TensorRT, please set TensorRT_ROOT. Include path not found.") + endif() + return() if(NOT TensorRT_LIBRARY) foreach(search ${_TensorRT_SEARCHES}) @@ -42,34 +54,71 @@ if(NOT TensorRT_LIBRARY) find_library(TRT_NVINFER_PLUGIN NAMES nvinfer_plugin ${${search}} PATH_SUFFIXES lib lib64 lib/x64) endforeach() list(APPEND TensorRT_LIBRARY ${TRT_NVINFER} ${TRT_NVINFER_PLUGIN} ${TRT_NVONNX_PARSER}) -endif() -mark_as_advanced(TensorRT_INCLUDE_DIR) +endif() +set(TensorRT_INCLUDE_DIRS ${TensorRT_INCLUDE_DIR}) -if(TensorRT_INCLUDE_DIR AND EXISTS "${TensorRT_INCLUDE_DIR}/NvInfer.h") - file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$") - file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$") - file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$") +# Extract version of tensorrt +if(EXISTS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h") + file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$") + file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$") + file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$") + file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_TWEAK REGEX "^#define NV_TENSORRT_BUILD [0-9]+.*$") - string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MAJOR "${TensorRT_MAJOR}") - string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MINOR "${TensorRT_MINOR}") - string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" TensorRT_VERSION_PATCH "${TensorRT_PATCH}") - set(TensorRT_VERSION_STRING "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}") + string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MAJOR "${TensorRT_MAJOR}") + string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MINOR "${TensorRT_MINOR}") + string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" TensorRT_VERSION_PATCH "${TensorRT_PATCH}") + string(REGEX REPLACE "^#define NV_TENSORRT_BUILD ([0-9]+).*$" "\\1" TensorRT_VERSION_TWEAK "${TensorRT_TWEAK}") + set(TensorRT_VERSION "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}.${TensorRT_VERSION_TWEAK}") endif() -include(FindPackageHandleStandardArgs) -FIND_PACKAGE_HANDLE_STANDARD_ARGS(TensorRT REQUIRED_VARS 
TensorRT_LIBRARY TensorRT_INCLUDE_DIR VERSION_VAR TensorRT_VERSION_STRING) +function(_find_trt_component component) + + # Find library for component (ie nvinfer, nvparsers, etc...) + find_library( + TensorRT_${component}_LIBRARY + NAMES ${component} + PATHS ${TensorRT_ROOT} ${TENSORRT_LIBRARY_DIR} ENV TensorRT_ROOT + ) -if(TensorRT_FOUND) - set(TensorRT_INCLUDE_DIRS ${TensorRT_INCLUDE_DIR}) + string(FIND ${TensorRT_${component}_LIBRARY} "NOTFOUND" _library_not_found) - if(NOT TensorRT_LIBRARIES) - set(TensorRT_LIBRARIES ${TensorRT_LIBRARY}) + if(NOT TensorRT_LIBRARY_DIR) + get_filename_component(_path ${TensorRT_${component}_LIBRARY} DIRECTORY) + set(TensorRT_LIBRARY_DIR + "${_path}" + CACHE INTERNAL "TensorRT_LIBRARY_DIR" + ) endif() - if(NOT TARGET TensorRT::TensorRT) - add_library(TensorRT::TensorRT UNKNOWN IMPORTED) - set_target_properties(TensorRT::TensorRT PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIRS}") - set_property(TARGET TensorRT::TensorRT APPEND PROPERTY IMPORTED_LOCATION "${TensorRT_LIBRARY}") + if(NOT TensorRT_LIBRARY_DIRS) + get_filename_component(_path ${TensorRT_${component}_LIBRARY} DIRECTORY) + set(TensorRT_LIBRARY_DIRS + "${_path}" + CACHE INTERNAL "TensorRT_LIBRARY_DIRS" + ) endif() -endif() + + # Library found, and doesn't already exists + if(_library_not_found EQUAL -1 AND NOT TARGET trt::${component}) + set(TensorRT_${component}_FOUND + TRUE + CACHE INTERNAL "Found ${component}" + ) + + # Create a target + add_library(trt::${component} IMPORTED INTERFACE) + target_include_directories(trt::${component} SYSTEM INTERFACE "${TensorRT_INCLUDE_DIRS}") + target_link_libraries(trt::${component} INTERFACE "${TensorRT_${component}_LIBRARY}") + set(TensorRT_LIBRARIES ${TensorRT_LIBRARIES} ${TensorRT_${component}_LIBRARY}) + endif() + +endfunction() + +# Find each components +foreach(component IN LISTS TensorRT_FIND_COMPONENTS) + _find_trt_component(${component}) +endforeach() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(TensorRT HANDLE_COMPONENTS VERSION_VAR TensorRT_VERSION REQUIRED_VARS TensorRT_INCLUDE_DIR) diff --git a/src/Detector/tensorrt_yolo/common/BatchStream.h b/src/Detector/tensorrt_yolo/common/BatchStream.h index a8da9923..c4ab9de0 100644 --- a/src/Detector/tensorrt_yolo/common/BatchStream.h +++ b/src/Detector/tensorrt_yolo/common/BatchStream.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -119,7 +120,7 @@ class MNISTBatchStream : public IBatchStream file.read(reinterpret_cast(rawData.data()), numElements * sizeof(uint8_t)); mData.resize(numElements); std::transform( - rawData.begin(), rawData.end(), mData.begin(), [](uint8_t val) { return static_cast(val) / 255.f; }); + rawData.begin(), rawData.end(), mData.begin(), [](uint8_t val) { return static_cast(val) / 255.F; }); } void readLabelsFile(const std::string& labelsFilePath) @@ -152,42 +153,39 @@ class MNISTBatchStream : public IBatchStream class BatchStream : public IBatchStream { public: - BatchStream( - int batchSize, int maxBatches, std::string prefix, std::string suffix, std::vector directories) + BatchStream(int batchSize, int maxBatches, std::string const& prefix, std::string const& suffix, + std::vector const& directories) : mBatchSize(batchSize) , mMaxBatches(maxBatches) , mPrefix(prefix) , mSuffix(suffix) , mDataDir(directories) { - FILE* file = fopen(locateFile(mPrefix + std::string("0") + mSuffix, mDataDir).c_str(), "rb"); - ASSERT(file != nullptr); + std::ifstream file(locateFile(mPrefix + std::string("0") + mSuffix, mDataDir).c_str(), std::ios::binary); + ASSERT(file.good()); int d[4]; - size_t readSize = fread(d, sizeof(int), 4, file); - ASSERT(readSize == 4); + file.read(reinterpret_cast(d), 4 * sizeof(int32_t)); mDims.nbDims = 4; // The number of dimensions. mDims.d[0] = d[0]; // Batch Size mDims.d[1] = d[1]; // Channels mDims.d[2] = d[2]; // Height mDims.d[3] = d[3]; // Width ASSERT(mDims.d[0] > 0 && mDims.d[1] > 0 && mDims.d[2] > 0 && mDims.d[3] > 0); - fclose(file); mImageSize = mDims.d[1] * mDims.d[2] * mDims.d[3]; mBatch.resize(mBatchSize * mImageSize, 0); mLabels.resize(mBatchSize, 0); mFileBatch.resize(mDims.d[0] * mImageSize, 0); mFileLabels.resize(mDims.d[0], 0); - reset(0); } - BatchStream(int batchSize, int maxBatches, std::string prefix, std::vector directories) + BatchStream(int batchSize, int maxBatches, std::string const& prefix, std::vector const& directories) : BatchStream(batchSize, maxBatches, prefix, ".batch", directories) { } - BatchStream( - int batchSize, int maxBatches, nvinfer1::Dims dims, std::string listFile, std::vector directories) + BatchStream(int batchSize, int maxBatches, nvinfer1::Dims const& dims, std::string const& listFile, + std::vector const& directories) : mBatchSize(batchSize) , mMaxBatches(maxBatches) , mDims(dims) @@ -199,7 +197,6 @@ class BatchStream : public IBatchStream mLabels.resize(mBatchSize, 0); mFileBatch.resize(mDims.d[0] * mImageSize, 0); mFileLabels.resize(mDims.d[0], 0); - reset(0); } // Resets data members @@ -219,7 +216,7 @@ class BatchStream : public IBatchStream return false; } - for (int csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize) + for (int64_t csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize) { ASSERT(mFileBatchPos > 0 && mFileBatchPos <= mDims.d[0]); if (mFileBatchPos == mDims.d[0] && !update()) @@ -228,7 +225,7 @@ class BatchStream : public IBatchStream } // copy the smaller of: elements left to fulfill the request, or elements left in the file buffer. 
- csize = std::min(mBatchSize - batchPos, mDims.d[0] - mFileBatchPos); + csize = std::min(mBatchSize - batchPos, mDims.d[0] - mFileBatchPos); std::copy_n( getFileBatch() + mFileBatchPos * mImageSize, csize * mImageSize, getBatch() + batchPos * mImageSize); std::copy_n(getFileLabels() + mFileBatchPos, csize, getLabels() + batchPos); @@ -295,22 +292,16 @@ class BatchStream : public IBatchStream if (mListFile.empty()) { std::string inputFileName = locateFile(mPrefix + std::to_string(mFileCount++) + mSuffix, mDataDir); - FILE* file = fopen(inputFileName.c_str(), "rb"); + std::ifstream file(inputFileName.c_str(), std::ios::binary); if (!file) { return false; } - int d[4]; - size_t readSize = fread(d, sizeof(int), 4, file); - ASSERT(readSize == 4); + file.read(reinterpret_cast(d), 4 * sizeof(int32_t)); ASSERT(mDims.d[0] == d[0] && mDims.d[1] == d[1] && mDims.d[2] == d[2] && mDims.d[3] == d[3]); - size_t readInputCount = fread(getFileBatch(), sizeof(float), mDims.d[0] * mImageSize, file); - ASSERT(readInputCount == size_t(mDims.d[0] * mImageSize)); - size_t readLabelCount = fread(getFileLabels(), sizeof(float), mDims.d[0], file); - ASSERT(readLabelCount == 0 || readLabelCount == size_t(mDims.d[0])); - - fclose(file); + file.read(reinterpret_cast(getFileBatch()), sizeof(float) * mDims.d[0] * mImageSize); + file.read(reinterpret_cast(getFileLabels()), sizeof(float) * mDims.d[0]); } else { @@ -368,7 +359,7 @@ class BatchStream : public IBatchStream return true; } - int mBatchSize{0}; + int64_t mBatchSize{0}; int mMaxBatches{0}; int mBatchCount{0}; int mFileCount{0}; diff --git a/src/Detector/tensorrt_yolo/common/EntropyCalibrator.h b/src/Detector/tensorrt_yolo/common/EntropyCalibrator.h index f31789bf..67a0130e 100644 --- a/src/Detector/tensorrt_yolo/common/EntropyCalibrator.h +++ b/src/Detector/tensorrt_yolo/common/EntropyCalibrator.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -28,8 +29,8 @@ template class EntropyCalibratorImpl { public: - EntropyCalibratorImpl( - TBatchStream stream, int firstBatch, std::string networkName, const char* inputBlobName, bool readCache = true) + EntropyCalibratorImpl(TBatchStream const& stream, int firstBatch, std::string const& networkName, + const char* inputBlobName, bool readCache = true) : mStream{stream} , mCalibrationTableName("CalibrationTable" + networkName) , mInputBlobName(inputBlobName) @@ -51,11 +52,12 @@ class EntropyCalibratorImpl return mStream.getBatchSize(); } - bool getBatch(void* bindings[], const char* names[], int /*nbBindings*/) noexcept + bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept { if (!mStream.next()) + { return false; - + } CHECK(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice)); ASSERT(!strcmp(names[0], mInputBlobName)); bindings[0] = mDeviceInput; @@ -101,8 +103,8 @@ template class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { public: - Int8EntropyCalibrator2( - TBatchStream stream, int firstBatch, const char* networkName, const char* inputBlobName, bool readCache = true) + Int8EntropyCalibrator2(TBatchStream const& stream, int32_t firstBatch, const char* networkName, + const char* inputBlobName, bool readCache = true) : mImpl(stream, firstBatch, networkName, inputBlobName, readCache) { } diff --git a/src/Detector/tensorrt_yolo/common/ErrorRecorder.h b/src/Detector/tensorrt_yolo/common/ErrorRecorder.h index 40b35fb5..bfb857c5 100644 --- a/src/Detector/tensorrt_yolo/common/ErrorRecorder.h +++ b/src/Detector/tensorrt_yolo/common/ErrorRecorder.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -16,7 +17,7 @@ #ifndef ERROR_RECORDER_H #define ERROR_RECORDER_H -#include "NvInferRuntimeCommon.h" +#include "NvInferRuntime.h" #include "logger.h" #include #include @@ -44,7 +45,7 @@ class SampleErrorRecorder : public IErrorRecorder public: SampleErrorRecorder() = default; - virtual ~SampleErrorRecorder() noexcept {} + ~SampleErrorRecorder() noexcept override {} int32_t getNbErrors() const noexcept final { return mErrorStack.size(); diff --git a/src/Detector/tensorrt_yolo/common/argsParser.h b/src/Detector/tensorrt_yolo/common/argsParser.h new file mode 100644 index 00000000..1f0b9025 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/argsParser.h @@ -0,0 +1,162 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef TENSORRT_ARGS_PARSER_H +#define TENSORRT_ARGS_PARSER_H + +#ifdef _MSC_VER +#include "getOptWin.h" +#else +#include +#endif +#include +#include +#include + +namespace samplesCommon +{ + +//! +//! \brief The SampleParams structure groups the basic parameters required by +//! all sample networks. +//! +struct SampleParams +{ + int32_t batchSize{1}; //!< Number of inputs in a batch + int32_t dlaCore{-1}; //!< Specify the DLA core to run network on. + bool int8{false}; //!< Allow runnning the network in Int8 mode. + bool fp16{false}; //!< Allow running the network in FP16 mode. + bool bf16{false}; //!< Allow running the network in BF16 mode. + std::vector dataDirs; //!< Directory paths where sample data files are stored + std::vector inputTensorNames; + std::vector outputTensorNames; + std::string timingCacheFile; //!< Path to timing cache file +}; + +//! +//! \brief The OnnxSampleParams structure groups the additional parameters required by +//! networks that use ONNX +//! +struct OnnxSampleParams : public SampleParams +{ + std::string onnxFileName; //!< Filename of ONNX file of a network +}; + +//! +//! /brief Struct to maintain command-line arguments. +//! +struct Args +{ + bool runInInt8{false}; + bool runInFp16{false}; + bool runInBf16{false}; + bool help{false}; + int32_t useDLACore{-1}; + int32_t batch{1}; + std::vector dataDirs; + std::string saveEngine; + std::string loadEngine; + bool rowOrder{true}; + std::string timingCacheFile; +}; + +//! +//! \brief Populates the Args struct with the provided command-line parameters. +//! +//! \throw invalid_argument if any of the arguments are not valid +//! +//! \return boolean If return value is true, execution can continue, otherwise program should exit +//! 
+inline bool parseArgs(Args& args, int32_t argc, char* argv[]) +{ + while (1) + { + int32_t arg; + static struct option long_options[] + = {{"help", no_argument, 0, 'h'}, {"datadir", required_argument, 0, 'd'}, {"int8", no_argument, 0, 'i'}, + {"fp16", no_argument, 0, 'f'}, {"bf16", no_argument, 0, 'z'}, {"columnOrder", no_argument, 0, 'c'}, + {"saveEngine", required_argument, 0, 's'}, {"loadEngine", required_argument, 0, 'o'}, + {"useDLACore", required_argument, 0, 'u'}, {"batch", required_argument, 0, 'b'}, + {"timingCacheFile", required_argument, 0, 't'}, {nullptr, 0, nullptr, 0}}; + int32_t option_index = 0; + arg = getopt_long(argc, argv, "hd:iu", long_options, &option_index); + if (arg == -1) + { + break; + } + + switch (arg) + { + case 'h': args.help = true; return true; + case 'd': + if (optarg) + { + args.dataDirs.push_back(optarg); + } + else + { + std::cerr << "ERROR: --datadir requires option argument" << std::endl; + return false; + } + break; + case 's': + if (optarg) + { + args.saveEngine = optarg; + } + break; + case 'o': + if (optarg) + { + args.loadEngine = optarg; + } + break; + case 'i': args.runInInt8 = true; break; + case 'f': args.runInFp16 = true; break; + case 'z': args.runInBf16 = true; break; + case 'c': args.rowOrder = false; break; + case 'u': + if (optarg) + { + args.useDLACore = std::stoi(optarg); + } + break; + case 'b': + if (optarg) + { + args.batch = std::stoi(optarg); + } + break; + case 't': + if (optarg) + { + args.timingCacheFile = optarg; + } + else + { + std::cerr << "ERROR: --timingCacheFile requires option argument" << std::endl; + return false; + } + break; + default: return false; + } + } + return true; +} + +} // namespace samplesCommon + +#endif // TENSORRT_ARGS_PARSER_H diff --git a/src/Detector/tensorrt_yolo/common/bfloat16.cpp b/src/Detector/tensorrt_yolo/common/bfloat16.cpp new file mode 100644 index 00000000..8222826a --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/bfloat16.cpp @@ -0,0 +1,60 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "bfloat16.h" +#include + +namespace sample +{ + +BFloat16::operator float() const +{ + static_assert(sizeof(uint32_t) == sizeof(float), ""); + float val{0.F}; + auto bits = static_cast(mRep) << 16; + std::memcpy(&val, &bits, sizeof(uint32_t)); + return val; +} + +BFloat16::BFloat16(float x) +{ + static_assert(sizeof(uint32_t) == sizeof(float), ""); + uint32_t bits{0}; + std::memcpy(&bits, &x, sizeof(float)); + + // FP32 format: 1 sign bit, 8 bit exponent, 23 bit mantissa + // BF16 format: 1 sign bit, 8 bit exponent, 7 bit mantissa + + // Mask for exponent + constexpr uint32_t exponent = 0xFFU << 23; + + // Check if exponent is all 1s (NaN or infinite) + if ((bits & exponent) != exponent) + { + // x is finite - round to even + bits += 0x7FFFU + (bits >> 16 & 1); + } + + mRep = static_cast(bits >> 16); +} + +BFloat16 operator+(BFloat16 x, BFloat16 y) +{ + return BFloat16(static_cast(x) + static_cast(y)); +} + +} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/bfloat16.h b/src/Detector/tensorrt_yolo/common/bfloat16.h new file mode 100644 index 00000000..0d0ab922 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/bfloat16.h @@ -0,0 +1,46 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace sample +{ + +//! Implements "Brain Floating Point": like an IEEE FP32, +//! but the significand is only 7 bits instead of 23 bits. +class BFloat16 +{ +public: + BFloat16() + : mRep(0) + { + } + + // Rounds to even if there is a tie. + BFloat16(float x); + + operator float() const; + +private: + //! Value stored in BFloat16 representation. + uint16_t mRep; +}; +BFloat16 operator+(BFloat16 x, BFloat16 y); + +} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/buffers.h b/src/Detector/tensorrt_yolo/common/buffers.h index ef673b2b..e58f2f5c 100644 --- a/src/Detector/tensorrt_yolo/common/buffers.h +++ b/src/Detector/tensorrt_yolo/common/buffers.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -238,28 +239,53 @@ class BufferManager public: static const size_t kINVALID_SIZE_VALUE = ~size_t(0); + //! + //! \brief Create a BufferManager for handling buffer interactions with engine, when the I/O tensor volumes + //! are provided + //! 
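//! A typical end-to-end flow with this class (illustrative sketch; the engine/context objects
//! and the tensor names "images"/"output0" are assumptions, not defined by this header):
//!
//!   samplesCommon::BufferManager buffers(engine);
//!   auto* hostIn = static_cast<float*>(buffers.getHostBuffer("images"));
//!   // ... fill hostIn with pre-processed input ...
//!   buffers.copyInputToDevice();
//!   bool const ok = context->executeV2(buffers.getDeviceBindings().data());
//!   buffers.copyOutputToHost();
//!   auto const* hostOut = static_cast<float const*>(buffers.getHostBuffer("output0"));
//!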
+ BufferManager( + std::shared_ptr engine, std::vector const& volumes, int32_t batchSize = 0) + : mEngine(engine) + , mBatchSize(batchSize) + { + // Create host and device buffers + for (int32_t i = 0; i < mEngine->getNbIOTensors(); i++) + { + auto const name = engine->getIOTensorName(i); + mNames[name] = i; + + nvinfer1::DataType type = mEngine->getTensorDataType(name); + + std::unique_ptr manBuf{new ManagedBuffer()}; + manBuf->deviceBuffer = DeviceBuffer(volumes[i], type); + manBuf->hostBuffer = HostBuffer(volumes[i], type); + void* deviceBuffer = manBuf->deviceBuffer.data(); + mDeviceBindings.emplace_back(deviceBuffer); + mManagedBuffers.emplace_back(std::move(manBuf)); + } + } + //! //! \brief Create a BufferManager for handling buffer interactions with engine. //! - BufferManager(std::shared_ptr engine, const int batchSize, - const nvinfer1::IExecutionContext* context = nullptr) + BufferManager(std::shared_ptr engine, int32_t const batchSize = 0, + nvinfer1::IExecutionContext const* context = nullptr) : mEngine(engine) , mBatchSize(batchSize) { - // Full Dims implies no batch size. - auto impbs = engine->hasImplicitBatchDimension(); - std::cout << "hasImplicitBatchDimension: " << impbs << ", mBatchSize = " << mBatchSize << std::endl; - assert(engine->hasImplicitBatchDimension() || mBatchSize == 0); // Create host and device buffers - for (int i = 0; i < mEngine->getNbBindings(); i++) + for (int32_t i = 0, e = mEngine->getNbIOTensors(); i < e; i++) { - auto dims = context ? context->getBindingDimensions(i) : mEngine->getBindingDimensions(i); + auto const name = engine->getIOTensorName(i); + mNames[name] = i; + + auto dims = context ? context->getTensorShape(name) : mEngine->getTensorShape(name); size_t vol = context || !mBatchSize ? 1 : static_cast(mBatchSize); - nvinfer1::DataType type = mEngine->getBindingDataType(i); - int vecDim = mEngine->getBindingVectorizedDim(i); + nvinfer1::DataType type = mEngine->getTensorDataType(name); + int32_t vecDim = mEngine->getTensorVectorizedDim(name); if (-1 != vecDim) // i.e., 0 != lgScalarsPerVector { - int scalarsPerVec = mEngine->getBindingComponentsPerElement(i); + int32_t scalarsPerVec = mEngine->getTensorComponentsPerElement(name); dims.d[vecDim] = divUp(dims.d[vecDim], scalarsPerVec); vol *= scalarsPerVec; } @@ -267,7 +293,8 @@ class BufferManager std::unique_ptr manBuf{new ManagedBuffer()}; manBuf->deviceBuffer = DeviceBuffer(vol, type); manBuf->hostBuffer = HostBuffer(vol, type); - mDeviceBindings.emplace_back(manBuf->deviceBuffer.data()); + void* deviceBuffer = manBuf->deviceBuffer.data(); + mDeviceBindings.emplace_back(deviceBuffer); mManagedBuffers.emplace_back(std::move(manBuf)); } } @@ -284,7 +311,7 @@ class BufferManager //! //! \brief Returns a vector of device buffers. //! - const std::vector& getDeviceBindings() const + std::vector const& getDeviceBindings() const { return mDeviceBindings; } @@ -293,7 +320,7 @@ class BufferManager //! \brief Returns the device buffer corresponding to tensorName. //! Returns nullptr if no such tensor can be found. //! - void* getDeviceBuffer(const std::string& tensorName) const + void* getDeviceBuffer(std::string const& tensorName) const { return getBuffer(false, tensorName); } @@ -302,72 +329,21 @@ class BufferManager //! \brief Returns the host buffer corresponding to tensorName. //! Returns nullptr if no such tensor can be found. //! - void* getHostBuffer(const std::string& tensorName) const + void* getHostBuffer(std::string const& tensorName) const { return getBuffer(true, tensorName); } - //! 
- //! \brief Returns the host buffer corresponding to tensorName. - //! Returns nullptr if no such tensor can be found. - //! - void* getHostBuffer(int bindingIndex) const - { - return getBuffer(true, bindingIndex); - } - //! //! \brief Returns the size of the host and device buffers that correspond to tensorName. //! Returns kINVALID_SIZE_VALUE if no such tensor can be found. //! - size_t size(const std::string& tensorName) const + size_t size(std::string const& tensorName) const { - int index = mEngine->getBindingIndex(tensorName.c_str()); - if (index == -1) + auto record = mNames.find(tensorName); + if (record == mNames.end()) return kINVALID_SIZE_VALUE; - return mManagedBuffers[index]->hostBuffer.nbBytes(); - } - - //! - //! \brief Dump host buffer with specified tensorName to ostream. - //! Prints error message to std::ostream if no such tensor can be found. - //! - void dumpBuffer(std::ostream& os, const std::string& tensorName) - { - int index = mEngine->getBindingIndex(tensorName.c_str()); - if (index == -1) - { - os << "Invalid tensor name" << std::endl; - return; - } - void* buf = mManagedBuffers[index]->hostBuffer.data(); - size_t bufSize = mManagedBuffers[index]->hostBuffer.nbBytes(); - nvinfer1::Dims bufDims = mEngine->getBindingDimensions(index); - size_t rowCount = static_cast(bufDims.nbDims > 0 ? bufDims.d[bufDims.nbDims - 1] : mBatchSize); - int leadDim = mBatchSize; - int* trailDims = bufDims.d; - int nbDims = bufDims.nbDims; - - // Fix explicit Dimension networks - if (!leadDim && nbDims > 0) - { - leadDim = bufDims.d[0]; - ++trailDims; - --nbDims; - } - - os << "[" << leadDim; - for (int i = 0; i < nbDims; i++) - os << ", " << trailDims[i]; - os << "]" << std::endl; - switch (mEngine->getBindingDataType(index)) - { - case nvinfer1::DataType::kINT32: print(os, buf, bufSize, rowCount); break; - case nvinfer1::DataType::kFLOAT: print(os, buf, bufSize, rowCount); break; - case nvinfer1::DataType::kHALF: print(os, buf, bufSize, rowCount); break; - case nvinfer1::DataType::kINT8: assert(0 && "Int8 network-level input and output is not supported"); break; - case nvinfer1::DataType::kBOOL: assert(0 && "Bool network-level input and output are not supported"); break; - } + return mManagedBuffers[record->second]->hostBuffer.nbBytes(); } //! @@ -382,7 +358,7 @@ class BufferManager assert(bufSize % sizeof(T) == 0); T* typedBuf = static_cast(buf); size_t numItems = bufSize / sizeof(T); - for (int i = 0; i < static_cast(numItems); i++) + for (int32_t i = 0; i < static_cast(numItems); i++) { // Handle rowCount == 1 case if (rowCount == 1 && i != static_cast(numItems) - 1) @@ -404,7 +380,7 @@ class BufferManager //! void copyInputToDevice() { - memcpyBuffers(true, false, false, 0); + memcpyBuffers(true, false, false); } //! @@ -412,13 +388,13 @@ class BufferManager //! void copyOutputToHost() { - memcpyBuffers(false, true, false, 0); + memcpyBuffers(false, true, false); } //! //! \brief Copy the contents of input host buffers to input device buffers asynchronously. //! - void copyInputToDeviceAsync(const cudaStream_t& stream) + void copyInputToDeviceAsync(cudaStream_t const& stream = 0) { memcpyBuffers(true, false, true, stream); } @@ -426,7 +402,7 @@ class BufferManager //! //! \brief Copy the contents of output device buffers to output host buffers asynchronously. //! 
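//! Note: the asynchronous variants only enqueue the copies on the given stream; host buffers
//! are safe to read only after the stream has been synchronized, e.g. via
//! CHECK(cudaStreamSynchronize(stream)).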
- void copyOutputToHostAsync(const cudaStream_t& stream) + void copyOutputToHostAsync(cudaStream_t const& stream = 0) { memcpyBuffers(false, true, true, stream); } @@ -434,30 +410,31 @@ class BufferManager ~BufferManager() = default; private: - void* getBuffer(const bool isHost, const std::string& tensorName) const + void* getBuffer(bool const isHost, std::string const& tensorName) const { - int index = mEngine->getBindingIndex(tensorName.c_str()); - if (index == -1) + auto record = mNames.find(tensorName); + if (record == mNames.end()) return nullptr; - return (isHost ? mManagedBuffers[index]->hostBuffer.data() : mManagedBuffers[index]->deviceBuffer.data()); + return (isHost ? mManagedBuffers[record->second]->hostBuffer.data() + : mManagedBuffers[record->second]->deviceBuffer.data()); } - void* getBuffer(const bool isHost, int bindingIndex) const + bool tenosrIsInput(const std::string& tensorName) const { - if (bindingIndex == -1) - return nullptr; - return (isHost ? mManagedBuffers[bindingIndex]->hostBuffer.data() : mManagedBuffers[bindingIndex]->deviceBuffer.data()); + return mEngine->getTensorIOMode(tensorName.c_str()) == nvinfer1::TensorIOMode::kINPUT; } - void memcpyBuffers(const bool copyInput, const bool deviceToHost, const bool async, const cudaStream_t& stream) + void memcpyBuffers(bool const copyInput, bool const deviceToHost, bool const async, cudaStream_t const& stream = 0) { - for (int i = 0; i < mEngine->getNbBindings(); i++) + for (auto const& n : mNames) { - void* dstPtr = deviceToHost ? mManagedBuffers[i]->hostBuffer.data() : mManagedBuffers[i]->deviceBuffer.data(); - const void* srcPtr = deviceToHost ? mManagedBuffers[i]->deviceBuffer.data() : mManagedBuffers[i]->hostBuffer.data(); - const size_t byteSize = mManagedBuffers[i]->hostBuffer.nbBytes(); + void* dstPtr = deviceToHost ? mManagedBuffers[n.second]->hostBuffer.data() + : mManagedBuffers[n.second]->deviceBuffer.data(); + void const* srcPtr = deviceToHost ? mManagedBuffers[n.second]->deviceBuffer.data() + : mManagedBuffers[n.second]->hostBuffer.data(); + size_t const byteSize = mManagedBuffers[n.second]->hostBuffer.nbBytes(); const cudaMemcpyKind memcpyType = deviceToHost ? cudaMemcpyDeviceToHost : cudaMemcpyHostToDevice; - if ((copyInput && mEngine->bindingIsInput(i)) || (!copyInput && !mEngine->bindingIsInput(i))) + if ((copyInput && tenosrIsInput(n.first)) || (!copyInput && !tenosrIsInput(n.first))) { if (async) CHECK(cudaMemcpyAsync(dstPtr, srcPtr, byteSize, memcpyType, stream)); @@ -468,9 +445,10 @@ class BufferManager } std::shared_ptr mEngine; //!< The pointer to the engine - int mBatchSize = 0; //!< The batch size for legacy networks, 0 otherwise. + int mBatchSize; //!< The batch size for legacy networks, 0 otherwise. std::vector> mManagedBuffers; //!< The vector of pointers to managed buffers - std::vector mDeviceBindings; //!< The vector of device buffers needed for engine execution + std::vector mDeviceBindings; //!< The vector of device buffers needed for engine execution + std::unordered_map mNames; //!< The map of tensor name and index pairs }; } // namespace samplesCommon diff --git a/src/Detector/tensorrt_yolo/common/common.h b/src/Detector/tensorrt_yolo/common/common.h index 2270a2cd..538c6094 100644 --- a/src/Detector/tensorrt_yolo/common/common.h +++ b/src/Detector/tensorrt_yolo/common/common.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -16,22 +17,13 @@ #ifndef TENSORRT_COMMON_H #define TENSORRT_COMMON_H - -// For loadLibrary -#ifdef _MSC_VER -// Needed so that the max/min definitions in windows.h do not conflict with std::max/min. -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include -#undef NOMINMAX -#else -#include -#endif - #include "NvInfer.h" +#if !TRT_WINML #include "NvInferPlugin.h" +#endif #include "logger.h" +#include "safeCommon.h" +#include "timingCache.h" #include #include #include @@ -39,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -52,7 +45,15 @@ #include #include -#include "safeCommon.h" +#ifdef _MSC_VER +// For loadLibrary +// Needed so that the max/min definitions in windows.h do not conflict with std::max/min. +#define NOMINMAX +#include +#undef NOMINMAX +#else +#include +#endif #ifdef _MSC_VER #define FN_NAME __FUNCTION__ @@ -82,7 +83,7 @@ if (!(condition)) \ { \ sample::gLogError << "Assertion failure: " << #condition << std::endl; \ - abort(); \ + exit(EXIT_FAILURE); \ } \ } while (0) @@ -96,7 +97,7 @@ OBJ_GUARD(T) makeObjGuard(T_* t) { CHECK(!(std::is_base_of::value || std::is_same::value)); - auto deleter = [](T* t) { t->destroy(); }; + auto deleter = [](T* t) { delete t; }; return std::unique_ptr{static_cast(t), deleter}; } @@ -113,21 +114,6 @@ constexpr long double operator"" _KiB(long double val) return val * (1 << 10); } -// These is necessary if we want to be able to write 1_GiB instead of 1.0_GiB. -// Since the return type is signed, -1_GiB will work as expected. -constexpr long long int operator"" _GiB(unsigned long long val) -{ - return val * (1 << 30); -} -constexpr long long int operator"" _MiB(unsigned long long val) -{ - return val * (1 << 20); -} -constexpr long long int operator"" _KiB(unsigned long long val) -{ - return val * (1 << 10); -} - struct SimpleProfiler : public nvinfer1::IProfiler { struct Record @@ -136,7 +122,7 @@ struct SimpleProfiler : public nvinfer1::IProfiler int count{0}; }; - virtual void reportLayerTime(const char* layerName, float ms) noexcept + void reportLayerTime(const char* layerName, float ms) noexcept override { mProfile[layerName].count++; mProfile[layerName].time += ms; @@ -183,7 +169,7 @@ struct SimpleProfiler : public nvinfer1::IProfiler auto old_precision = out.precision(); // Output header { - out << std::setw(maxLayerNameLength) << layerNameStr << " "; + out << std::setfill(' ') << std::setw(maxLayerNameLength) << layerNameStr << " "; out << std::setw(12) << "Runtime, " << "%" << " "; @@ -214,80 +200,12 @@ struct SimpleProfiler : public nvinfer1::IProfiler std::map mProfile; }; -//! Locate path to file, given its filename or filepath suffix and possible dirs it might lie in. -//! Function will also walk back MAX_DEPTH dirs from CWD to check for such a file path. 
-inline std::string locateFile( - const std::string& filepathSuffix, const std::vector& directories, bool reportError = true) -{ - const int MAX_DEPTH{10}; - bool found{false}; - std::string filepath; - - for (auto& dir : directories) - { - if (!dir.empty() && dir.back() != '/') - { -#ifdef _MSC_VER - filepath = dir + "\\" + filepathSuffix; -#else - filepath = dir + "/" + filepathSuffix; -#endif - } - else - { - filepath = dir + filepathSuffix; - } - - for (int i = 0; i < MAX_DEPTH && !found; i++) - { - const std::ifstream checkFile(filepath); - found = checkFile.is_open(); - if (found) - { - break; - } - - filepath = "../" + filepath; // Try again in parent dir - } - - if (found) - { - break; - } - - filepath.clear(); - } - - // Could not find the file - if (filepath.empty()) - { - const std::string dirList = std::accumulate(directories.begin() + 1, directories.end(), directories.front(), - [](const std::string& a, const std::string& b) { return a + "\n\t" + b; }); - std::cout << "Could not find " << filepathSuffix << " in data directories:\n\t" << dirList << std::endl; - - if (reportError) - { - std::cout << "&&&& FAILED" << std::endl; - exit(EXIT_FAILURE); - } - } - - return filepath; -} - -inline void readPGMFile(const std::string& fileName, uint8_t* buffer, int inH, int inW) -{ - std::ifstream infile(fileName, std::ifstream::binary); - assert(infile.is_open() && "Attempting to read from a file that is not open."); - std::string magic, h, w, max; - infile >> magic >> h >> w >> max; - infile.seekg(1, infile.cur); - infile.read(reinterpret_cast(buffer), inH * inW); -} - namespace samplesCommon { - +using nvinfer1::utils::loadTimingCacheFile; +using nvinfer1::utils::buildTimingCacheFromFile; +using nvinfer1::utils::saveTimingCacheFile; +using nvinfer1::utils::updateTimingCacheFile; // Swaps endianness of an integral type. template ::value, int>::type = 0> inline T swapEndianness(const T& value) @@ -339,7 +257,7 @@ class TypedHostMemory : public HostMemory { mData = new ElemType[size]; }; - ~TypedHostMemory() noexcept + ~TypedHostMemory() noexcept override { delete[](ElemType*) mData; } @@ -360,7 +278,7 @@ inline void* safeCudaMalloc(size_t memSize) if (deviceMem == nullptr) { std::cerr << "Out of memory" << std::endl; - exit(1); + exit(EXIT_FAILURE); } return deviceMem; } @@ -375,25 +293,20 @@ struct InferDeleter template void operator()(T* obj) const { -#if (NV_TENSORRT_MAJOR < 8) - obj->destroy(); -#else delete obj; -#endif } }; template -using SampleUniquePtr = std::unique_ptr; +using SampleUniquePtr = std::unique_ptr; -static auto StreamDeleter = [](cudaStream_t* pStream) +static auto StreamDeleter = [](cudaStream_t* pStream) { + if (pStream) { - if (pStream) - { - cudaStreamDestroy(*pStream); - delete pStream; - } - }; + static_cast(cudaStreamDestroy(*pStream)); + delete pStream; + } +}; inline std::unique_ptr makeCudaStream() { @@ -531,7 +444,7 @@ inline float getMaxValue(const float* buffer, int64_t size) // // The default parameter values choosen arbitrarily. Range values should be choosen such that // we avoid underflow or overflow. Also range value should be non zero to avoid uniform zero scale tensor. -inline void setAllDynamicRanges(nvinfer1::INetworkDefinition* network, float inRange = 2.0f, float outRange = 4.0f) +inline void setAllDynamicRanges(nvinfer1::INetworkDefinition* network, float inRange = 2.0F, float outRange = 4.0F) { // Ensure that all layer inputs have a scale. 
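    // Clarifying note: setDynamicRange(-r, r) declares that a tensor's values lie in the
    // symmetric interval [-r, r]; for INT8 this corresponds to a quantization scale of
    // roughly r / 127. The loop below assigns inRange to layer inputs and outRange to layer
    // outputs that do not already carry ranges, so it is suitable for functional testing only.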
for (int i = 0; i < network->getNbLayers(); i++) @@ -579,14 +492,15 @@ inline void setDummyInt8DynamicRanges(const nvinfer1::IBuilderConfig* c, nvinfer // Set dummy per-tensor dynamic range if Int8 mode is requested. if (c->getFlag(nvinfer1::BuilderFlag::kINT8)) { - sample::gLogWarning - << "Int8 calibrator not provided. Generating dummy per-tensor dynamic range. Int8 accuracy is not guaranteed." - << std::endl; + sample::gLogWarning << "Int8 calibrator not provided. Generating dummy per-tensor dynamic range. Int8 accuracy " + "is not guaranteed." + << std::endl; setAllDynamicRanges(n); } } -inline void enableDLA(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, int useDLACore, bool allowGPUFallback = true) +inline void enableDLA( + nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, int useDLACore, bool allowGPUFallback = true) { if (useDLACore >= 0) { @@ -627,18 +541,28 @@ inline uint32_t getElementSize(nvinfer1::DataType t) noexcept { switch (t) { - case nvinfer1::DataType::kINT32: return 4; + case nvinfer1::DataType::kINT64: return 8; + case nvinfer1::DataType::kINT32: case nvinfer1::DataType::kFLOAT: return 4; + case nvinfer1::DataType::kBF16: case nvinfer1::DataType::kHALF: return 2; case nvinfer1::DataType::kBOOL: - case nvinfer1::DataType::kINT8: return 1; + case nvinfer1::DataType::kUINT8: + case nvinfer1::DataType::kINT8: + case nvinfer1::DataType::kFP8: return 1; + case nvinfer1::DataType::kINT4: + ASSERT(false && "Element size is not implemented for sub-byte data-types"); } return 0; } -inline int64_t volume(const nvinfer1::Dims& d) +inline int64_t volume(nvinfer1::Dims const& dims, int32_t start, int32_t stop) { - return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); + ASSERT(start >= 0); + ASSERT(start <= stop); + ASSERT(stop <= dims.nbDims); + ASSERT(std::all_of(dims.d + start, dims.d + stop, [](int32_t x) { return x >= 0; })); + return std::accumulate(dims.d + start, dims.d + stop, int64_t{1}, std::multiplies{}); } template @@ -698,7 +622,7 @@ void writePPMFileWithBBox(const std::string& filename, PPM& ppm, const << ppm.w << " " << ppm.h << "\n" << ppm.max << "\n"; - auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); }; + auto round = [](float x) -> int { return int(std::floor(x + 0.5F)); }; const int x1 = std::min(std::max(0, round(int(bbox.x1))), W - 1); const int x2 = std::min(std::max(0, round(int(bbox.x2))), W - 1); const int y1 = std::min(std::max(0, round(int(bbox.y1))), H - 1); @@ -739,7 +663,7 @@ inline void writePPMFileWithBBox(const std::string& filename, vPPM ppm, std::vec << "\n" << ppm.w << " " << ppm.h << "\n" << ppm.max << "\n"; - auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); }; + auto round = [](float x) -> int { return int(std::floor(x + 0.5F)); }; for (auto bbox : dets) { @@ -778,7 +702,7 @@ class TimerBase virtual void stop() {} float microseconds() const noexcept { - return mMs * 1000.f; + return mMs * 1000.F; } float milliseconds() const noexcept { @@ -786,15 +710,15 @@ class TimerBase } float seconds() const noexcept { - return mMs / 1000.f; + return mMs / 1000.F; } void reset() noexcept { - mMs = 0.f; + mMs = 0.F; } protected: - float mMs{0.0f}; + float mMs{0.0F}; }; class GpuTimer : public TimerBase @@ -811,14 +735,14 @@ class GpuTimer : public TimerBase CHECK(cudaEventDestroy(mStart)); CHECK(cudaEventDestroy(mStop)); } - void start() + void start() override { CHECK(cudaEventRecord(mStart, mStream)); } - void stop() + void stop() override { CHECK(cudaEventRecord(mStop, 
mStream)); - float ms{0.0f}; + float ms{0.0F}; CHECK(cudaEventSynchronize(mStop)); CHECK(cudaEventElapsedTime(&ms, mStart, mStop)); mMs += ms; @@ -835,11 +759,11 @@ class CpuTimer : public TimerBase public: using clock_type = Clock; - void start() + void start() override { mStart = Clock::now(); } - void stop() + void stop() override { mStop = Clock::now(); mMs += std::chrono::duration{mStop - mStart}.count(); @@ -865,13 +789,7 @@ inline std::vector splitString(std::string str, char delimiter = ', return splitVect; } -// Return m rounded up to nearest multiple of n -inline int roundUp(int m, int n) -{ - return ((m + n - 1) / n) * n; -} - -inline int getC(const nvinfer1::Dims& d) +inline int getC(nvinfer1::Dims const& d) { return d.nbDims >= 3 ? d.d[d.nbDims - 3] : 1; } @@ -886,54 +804,111 @@ inline int getW(const nvinfer1::Dims& d) return d.nbDims >= 1 ? d.d[d.nbDims - 1] : 1; } -inline void loadLibrary(const std::string& path) +//! Platform-agnostic wrapper around dynamic libraries. +class DynamicLibrary { -#ifdef _MSC_VER - void* handle = LoadLibrary(path.c_str()); -#else - int32_t flags{RTLD_LAZY}; +public: + explicit DynamicLibrary(std::string const& name) + : mLibName{name} + { +#if defined(_WIN32) + mHandle = LoadLibraryA(name.c_str()); +#else // defined(_WIN32) + int32_t flags{RTLD_LAZY}; #if ENABLE_ASAN - // https://github.com/google/sanitizers/issues/89 - // asan doesn't handle module unloading correctly and there are no plans on doing - // so. In order to get proper stack traces, don't delete the shared library on - // close so that asan can resolve the symbols correctly. - flags |= RTLD_NODELETE; + // https://github.com/google/sanitizers/issues/89 + // asan doesn't handle module unloading correctly and there are no plans on doing + // so. In order to get proper stack traces, don't delete the shared library on + // close so that asan can resolve the symbols correctly. + flags |= RTLD_NODELETE; #endif // ENABLE_ASAN - void* handle = dlopen(path.c_str(), flags); + mHandle = dlopen(name.c_str(), flags); +#endif // defined(_WIN32) + + if (mHandle == nullptr) + { + std::string errorStr{}; +#if !defined(_WIN32) + errorStr = std::string{" due to "} + std::string{dlerror()}; #endif - if (handle == nullptr) + throw std::runtime_error("Unable to open library: " + name + errorStr); + } + } + + DynamicLibrary(DynamicLibrary const&) = delete; + DynamicLibrary(DynamicLibrary const&&) = delete; + + //! + //! Retrieve a function symbol from the loaded library. + //! + //! \return the loaded symbol on success + //! \throw std::invalid_argument if loading the symbol failed. + //! 
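//! Illustrative use (the library and symbol names below are placeholders):
//!
//!   samplesCommon::DynamicLibrary pluginLib{"libcustom_plugins.so"};
//!   auto registerFn = pluginLib.symbolAddress<bool(char const*)>("registerCustomPlugins");
//!   bool const registered = registerFn("my_plugin_namespace");
//!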
+ template + std::function symbolAddress(char const* name) { -#ifdef _MSC_VER - sample::gLogError << "Could not load plugin library: " << path << std::endl; + if (mHandle == nullptr) + { + throw std::runtime_error("Handle to library is nullptr."); + } + void* ret; +#if defined(_MSC_VER) + ret = static_cast(GetProcAddress(static_cast(mHandle), name)); #else - sample::gLogError << "Could not load plugin library: " << path << ", due to: " << dlerror() << std::endl; + ret = dlsym(mHandle, name); #endif + if (ret == nullptr) + { + std::string const kERROR_MSG(mLibName + ": error loading symbol: " + std::string(name)); + throw std::invalid_argument(kERROR_MSG); + } + return reinterpret_cast(ret); } -} -inline int32_t getSMVersion() -{ - int32_t deviceIndex = 0; - CHECK(cudaGetDevice(&deviceIndex)); + ~DynamicLibrary() + { + try + { +#if defined(_WIN32) + ASSERT(static_cast(FreeLibrary(static_cast(mHandle)))); +#else + ASSERT(dlclose(mHandle) == 0); +#endif + } + catch (...) + { + sample::gLogError << "Unable to close library: " << mLibName << std::endl; + } + } - int32_t major, minor; - CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, deviceIndex)); - CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, deviceIndex)); +private: + std::string mLibName{}; //!< Name of the DynamicLibrary + void* mHandle{}; //!< Handle to the DynamicLibrary +}; - return ((major << 8) | minor); +inline std::unique_ptr loadLibrary(std::string const& path) +{ + // make_unique not available until C++14 - we still need to support C++11 builds. + return std::unique_ptr(new DynamicLibrary{path}); } -inline bool isSMSafe() +inline int32_t getMaxPersistentCacheSize() { - const int32_t smVersion = getSMVersion(); - return smVersion == 0x0700 || smVersion == 0x0702 || smVersion == 0x0705 || - smVersion == 0x0800 || smVersion == 0x0806 || smVersion == 0x0807; + int32_t deviceIndex{}; + CHECK(cudaGetDevice(&deviceIndex)); + + int32_t maxPersistentL2CacheSize{}; +#if CUDART_VERSION >= 11030 && !TRT_WINML + CHECK(cudaDeviceGetAttribute(&maxPersistentL2CacheSize, cudaDevAttrMaxPersistingL2CacheSize, deviceIndex)); +#endif + + return maxPersistentL2CacheSize; } inline bool isDataTypeSupported(nvinfer1::DataType dataType) { - auto builder = SampleUniquePtr(nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger())); + auto builder = SampleUniquePtr(createBuilder()); if (!builder) { return false; @@ -947,7 +922,6 @@ inline bool isDataTypeSupported(nvinfer1::DataType dataType) return true; } - } // namespace samplesCommon inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) diff --git a/src/Detector/tensorrt_yolo/common/dumpTFWts.py b/src/Detector/tensorrt_yolo/common/dumpTFWts.py new file mode 100644 index 00000000..70770fbd --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/dumpTFWts.py @@ -0,0 +1,124 @@ +#!/usr/bin/python +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Script to dump TensorFlow weights in TRT v1 and v2 dump format. +# The V1 format is for TensorRT 4.0. The V2 format is for TensorRT 4.0 and later. + +import sys +import struct +import argparse + +try: + import tensorflow as tf + from tensorflow.python import pywrap_tensorflow +except ImportError as err: + sys.stderr.write("""Error: Failed to import module ({})""".format(err)) + sys.exit() + +parser = argparse.ArgumentParser(description="TensorFlow Weight Dumper") + +parser.add_argument( + "-m", + "--model", + required=True, + help="The checkpoint file basename, example basename(model.ckpt-766908.data-00000-of-00001) -> model.ckpt-766908", +) +parser.add_argument("-o", "--output", required=True, help="The weight file to dump all the weights to.") +parser.add_argument("-1", "--wtsv1", required=False, default=False, type=bool, help="Dump the weights in the wts v1.") + +opt = parser.parse_args() + +if opt.wtsv1: + print("Outputting the trained weights in TensorRT's wts v1 format. This format is documented as:") + print("Line 0: ") + print("Line 1-Num: [buffer name] [buffer type] [buffer size] ") +else: + print("Outputting the trained weights in TensorRT's wts v2 format. This format is documented as:") + print("Line 0: ") + print("Line 1-Num: [buffer name] [buffer type] [(buffer shape{e.g. (1, 2, 3)}] ") + +inputbase = opt.model +outputbase = opt.output + + +def float_to_hex(f): + return hex(struct.unpack(" +#include +#include + +namespace nvinfer1 +{ +namespace utils +{ +FileLock::FileLock(ILogger& logger, std::string const& fileName) + : mLogger(logger) + , mFileName(fileName) +{ + std::string lockFileName = mFileName + ".lock"; +#ifdef _MSC_VER + { + std::stringstream ss; + ss << "Trying to set exclusive file lock " << lockFileName << std::endl; + mLogger.log(ILogger::Severity::kVERBOSE, ss.str().c_str()); + } + // MS docs said this is a blocking IO if "FILE_FLAG_OVERLAPPED" is not provided + mHandle = CreateFileA(lockFileName.c_str(), GENERIC_WRITE, 0, NULL, OPEN_ALWAYS, 0, NULL); + if (mHandle == INVALID_HANDLE_VALUE) + { + throw std::runtime_error("Failed to lock " + lockFileName + "!"); + } +#elif defined(__QNX__) + // We once enabled the file lock on QNX, lockf(F_TLOCK) return -1 and the reported error is + // The error generated was 89, which means that the function is not implemented. 
+#else + mHandle = fopen(lockFileName.c_str(), "wb+"); + if (mHandle == nullptr) + { + throw std::runtime_error("Cannot open " + lockFileName + "!"); + } + { + std::stringstream ss; + ss << "Trying to set exclusive file lock " << lockFileName << std::endl; + mLogger.log(ILogger::Severity::kVERBOSE, ss.str().c_str()); + } + mDescriptor = fileno(mHandle); + auto ret = lockf(mDescriptor, F_LOCK, 0); + if (ret != 0) + { + mDescriptor = -1; + fclose(mHandle); + throw std::runtime_error("Failed to lock " + lockFileName + "!"); + } +#endif +} + +FileLock::~FileLock() +{ + std::string lockFileName = mFileName + ".lock"; +#ifdef _MSC_VER + if (mHandle != INVALID_HANDLE_VALUE) + { + CloseHandle(mHandle); + } +#elif defined(__QNX__) + // We once enabled the file lock on QNX, lockf(F_TLOCK) return -1 and the reported error is + // The error generated was 89 + // That means : Function not implemented +#else + if (mDescriptor != -1) + { + auto ret = lockf(mDescriptor, F_ULOCK, 0); + if (mHandle != nullptr) + { + fclose(mHandle); + } + if (ret != 0) + { + std::stringstream ss; + ss << "Failed to unlock " << lockFileName << ", please remove " << lockFileName << ".lock manually!" + << std::endl; + mLogger.log(ILogger::Severity::kVERBOSE, ss.str().c_str()); + } + } +#endif +} +} // namespace utils +} // namespace nvinfer1 diff --git a/src/Detector/tensorrt_yolo/common/fileLock.h b/src/Detector/tensorrt_yolo/common/fileLock.h new file mode 100644 index 00000000..d0f64a5b --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/fileLock.h @@ -0,0 +1,86 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TENSORRT_SAMPLES_COMMON_FILELOCK_H_ +#define TENSORRT_SAMPLES_COMMON_FILELOCK_H_ +#include "NvInfer.h" +#ifdef _MSC_VER +// Needed so that the max/min definitions in windows.h do not conflict with std::max/min. +#define NOMINMAX +#include +#undef NOMINMAX +#else +#include // fileno +#include // lockf +#endif +#include + +namespace nvinfer1 +{ +namespace utils +{ +//! +//! \brief RAII object that locks a the specified file. +//! +//! The FileLock class uses a lock file to specify that the +//! current file is being used by a TensorRT tool or sample +//! so that things like the TimingCache can be updated across +//! processes without having conflicts. +//! +class FileLock +{ +public: + FileLock(nvinfer1::ILogger& logger, std::string const& fileName); + ~FileLock(); + FileLock() = delete; // no default ctor + FileLock(FileLock const&) = delete; // no copy ctor + FileLock& operator=(FileLock const&) = delete; // no copy assignment + FileLock(FileLock&&) = delete; // no move ctor + FileLock& operator=(FileLock&&) = delete; // no move assignment + +private: + //! + //! The logger that emits any error messages that might show up. + //! + nvinfer1::ILogger& mLogger; + + //! + //! The filename that the FileLock is protecting from multiple + //! 
TensorRT processes from writing to. + //! + std::string const mFileName; + +#ifdef _MSC_VER + //! + //! The file handle on windows for the file lock. + //! + HANDLE mHandle{}; +#else + //! + //! The file handle on linux for the file lock. + //! + FILE* mHandle{}; + //! + //! The file descriptor on linux of the file lock. + //! + int32_t mDescriptor{-1}; +#endif +}; // class FileLock +} // namespace utils +} // namespace nvinfer1 + +#endif // TENSORRT_SAMPLES_COMMON_FILELOCK_H_ diff --git a/src/Detector/tensorrt_yolo/common/getOptions.cpp b/src/Detector/tensorrt_yolo/common/getOptions.cpp new file mode 100644 index 00000000..19cd3281 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/getOptions.cpp @@ -0,0 +1,248 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "getOptions.h" +#include "logger.h" + +#include +#include +#include +#include +#include + +namespace nvinfer1 +{ +namespace utility +{ + +//! Matching for TRTOptions is defined as follows: +//! +//! If A and B both have longName set, A matches B if and only if A.longName == +//! B.longName and (A.shortName == B.shortName if both have short name set). +//! +//! If A only has shortName set and B only has longName set, then A does not +//! match B. It is assumed that when 2 TRTOptions are compared, one of them is +//! the definition of a TRTOption in the input to getOptions. As such, if the +//! definition only has shortName set, it will never be equal to a TRTOption +//! that does not have shortName set (and same for longName). +//! +//! If A and B both have shortName set but B does not have longName set, A +//! matches B if and only if A.shortName == B.shortName. +//! +//! If A has neither long or short name set, A matches B if and only if B has +//! neither long or short name set. +bool matches(const TRTOption& a, const TRTOption& b) +{ + if (!a.longName.empty() && !b.longName.empty()) + { + if (a.shortName && b.shortName) + { + return (a.longName == b.longName) && (a.shortName == b.shortName); + } + return a.longName == b.longName; + } + + // If only one of them is not set, this will return false anyway. + return a.shortName == b.shortName; +} + +//! getTRTOptionIndex returns the index of a TRTOption in a vector of +//! TRTOptions, -1 if not found. +int getTRTOptionIndex(const std::vector& options, const TRTOption& opt) +{ + for (size_t i = 0; i < options.size(); ++i) + { + if (matches(opt, options[i])) + { + return i; + } + } + return -1; +} + +//! validateTRTOption will return a string containing an error message if options +//! contain non-numeric characters, or if there are duplicate option names found. +//! Otherwise, returns the empty string. 
+std::string validateTRTOption( + const std::set& seenShortNames, const std::set& seenLongNames, const TRTOption& opt) +{ + if (opt.shortName != 0) + { + if (!std::isalnum(opt.shortName)) + { + return "Short name '" + std::to_string(opt.shortName) + "' is non-alphanumeric"; + } + + if (seenShortNames.find(opt.shortName) != seenShortNames.end()) + { + return "Short name '" + std::to_string(opt.shortName) + "' is a duplicate"; + } + } + + if (!opt.longName.empty()) + { + for (const char& c : opt.longName) + { + if (!std::isalnum(c) && c != '-' && c != '_') + { + return "Long name '" + opt.longName + "' contains characters that are not '-', '_', or alphanumeric"; + } + } + + if (seenLongNames.find(opt.longName) != seenLongNames.end()) + { + return "Long name '" + opt.longName + "' is a duplicate"; + } + } + return ""; +} + +//! validateTRTOptions will return a string containing an error message if any +//! options contain non-numeric characters, or if there are duplicate option +//! names found. Otherwise, returns the empty string. +std::string validateTRTOptions(const std::vector& options) +{ + std::set seenShortNames; + std::set seenLongNames; + for (size_t i = 0; i < options.size(); ++i) + { + const std::string errMsg = validateTRTOption(seenShortNames, seenLongNames, options[i]); + if (!errMsg.empty()) + { + return "Error '" + errMsg + "' at TRTOption " + std::to_string(i); + } + + seenShortNames.insert(options[i].shortName); + seenLongNames.insert(options[i].longName); + } + return ""; +} + +//! parseArgs parses an argument list and returns a TRTParsedArgs with the +//! fields set accordingly. Assumes that options is validated. +//! ErrMsg will be set if: +//! - an argument is null +//! - an argument is empty +//! - an argument does not have option (i.e. "-" and "--") +//! - a short argument has more than 1 character +//! - the last argument in the list requires a value +TRTParsedArgs parseArgs(int argc, const char* const* argv, const std::vector& options) +{ + TRTParsedArgs parsedArgs; + parsedArgs.values.resize(options.size()); + + for (int i = 1; i < argc; ++i) // index of current command-line argument + { + if (argv[i] == nullptr) + { + return TRTParsedArgs{"Null argument at index " + std::to_string(i)}; + } + + const std::string argStr(argv[i]); + if (argStr.empty()) + { + return TRTParsedArgs{"Empty argument at index " + std::to_string(i)}; + } + + // No starting hyphen means it is a positional argument + if (argStr[0] != '-') + { + parsedArgs.positionalArgs.push_back(argStr); + continue; + } + + if (argStr == "-" || argStr == "--") + { + return TRTParsedArgs{"Argument does not specify an option at index " + std::to_string(i)}; + } + + // If only 1 hyphen, char after is the flag. 
+ TRTOption opt{' ', "", false, ""}; + std::string value; + if (argStr[1] != '-') + { + // Must only have 1 char after the hyphen + if (argStr.size() > 2) + { + return TRTParsedArgs{"Short arg contains more than 1 character at index " + std::to_string(i)}; + } + opt.shortName = argStr[1]; + } + else + { + opt.longName = argStr.substr(2); + + // We need to support --foo=bar syntax, so look for '=' + const size_t eqIndex = opt.longName.find('='); + if (eqIndex < opt.longName.size()) + { + value = opt.longName.substr(eqIndex + 1); + opt.longName = opt.longName.substr(0, eqIndex); + } + } + + const int idx = getTRTOptionIndex(options, opt); + if (idx < 0) + { + continue; + } + + if (options[idx].valueRequired) + { + if (!value.empty()) + { + parsedArgs.values[idx].second.push_back(value); + parsedArgs.values[idx].first = parsedArgs.values[idx].second.size(); + continue; + } + + if (i + 1 >= argc) + { + return TRTParsedArgs{"Last argument requires value, but none given"}; + } + + const std::string nextArg(argv[i + 1]); + if (nextArg.size() >= 1 && nextArg[0] == '-') + { + sample::gLogWarning << "Warning: Using '" << nextArg << "' as a value for '" << argStr + << "', Should this be its own flag?" << std::endl; + } + + parsedArgs.values[idx].second.push_back(nextArg); + i += 1; // Next argument already consumed + + parsedArgs.values[idx].first = parsedArgs.values[idx].second.size(); + } + else + { + parsedArgs.values[idx].first += 1; + } + } + return parsedArgs; +} + +TRTParsedArgs getOptions(int argc, const char* const* argv, const std::vector& options) +{ + const std::string errMsg = validateTRTOptions(options); + if (!errMsg.empty()) + { + return TRTParsedArgs{errMsg}; + } + return parseArgs(argc, argv, options); +} +} // namespace utility +} // namespace nvinfer1 diff --git a/src/Detector/tensorrt_yolo/common/getOptions.h b/src/Detector/tensorrt_yolo/common/getOptions.h new file mode 100644 index 00000000..4bbf9e27 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/getOptions.h @@ -0,0 +1,128 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_GET_OPTIONS_H +#define TRT_GET_OPTIONS_H + +#include +#include +#include + +namespace nvinfer1 +{ +namespace utility +{ + +//! TRTOption defines a command line option. At least 1 of shortName and longName +//! must be defined. +//! If bool initialization is undefined behavior on your system, valueRequired +//! must also be explicitly defined. +//! helpText is optional. +struct TRTOption +{ + char shortName; //!< Option name in short (single hyphen) form (i.e. -a, -b) + std::string longName; //!< Option name in long (double hyphen) form (i.e. --foo, --bar) + bool valueRequired; //!< True if a value is needed for an option (i.e. -N 4, --foo bar) + std::string helpText; //!< Text to show when printing out the command usage +}; + +//! 
TRTParsedArgs is returned by getOptions after it has parsed a command line +//! argument list (argv). +//! +//! errMsg is a string containing an error message if any errors occurred. If it +//! is empty, no errors occurred. +//! +//! values stores a vector of pairs for each option (ordered by order in the +//! input). Each pair contains an int (the number of occurrences) and a vector +//! of strings (a list of values). The user should know which of these to use, +//! and which options required values. For non-value options, only occurrences is +//! populated. For value-required options, occurrences == # of values. Values do +//! not need to be unique. +//! +//! positionalArgs stores additional arguments that are passed in without an +//! option (these must not start with a hyphen). +struct TRTParsedArgs +{ + std::string errMsg; + std::vector>> values; + std::vector positionalArgs; +}; + +//! Parse the input arguments passed to main() and extract options as well as +//! positional arguments. +//! +//! Options are supposed to be passed to main() with a preceding hyphen '-'. +//! +//! If there is a single preceding hyphen, there should be exactly 1 character +//! after the hyphen, which is interpreted as the option. +//! +//! If there are 2 preceding hyphens, the entire argument (without the hyphens) +//! is interpreted as the option. +//! +//! If the option requires a value, the next argument is used as the value. +//! +//! Positional arguments must not start with a hyphen. +//! +//! If an argument requires a value, the next argument is interpreted as the +//! value, even if it is the form of a valid option (i.e. --foo --bar will store +//! "--bar" as a value for option "foo" if "foo" requires a value). +//! We also support --name=value syntax. In this case, 'value' would be used as +//! the value, NOT the next argument. +//! +//! For options: +//! { { 'a', "", false }, +//! { 'b', "", false }, +//! { 0, "cee", false }, +//! { 'd', "", true }, +//! { 'e', "", true }, +//! { 'f', "foo", true } } +//! +//! ./main hello world -a -a --cee -d 12 -f 34 +//! and +//! ./main hello world -a -a --cee -d 12 --foo 34 +//! +//! will result in: +//! +//! TRTParsedArgs { +//! errMsg: "", +//! values: { { 2, {} }, +//! { 0, {} }, +//! { 1, {} }, +//! { 1, {"12"} }, +//! { 0, {} }, +//! { 1, {"34"} } } +//! positionalArgs: {"hello", "world"}, +//! } +//! +//! Non-POSIX behavior: +//! - Does not support "-abcde" as a shorthand for "-a -b -c -d -e". Each +//! option must have its own hyphen prefix. +//! - Does not support -e12 as a shorthand for "-e 12". Values MUST be +//! whitespace-separated from the option it is for. +//! +//! @param[in] argc The number of arguments passed to main (including the +//! file name, which is disregarded) +//! @param[in] argv The arguments passed to main (including the file name, +//! which is disregarded) +//! @param[in] options List of TRTOptions to parse +//! @return TRTParsedArgs. See TRTParsedArgs documentation for descriptions of +//! the fields. 
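//!
//! A sketch of a call site (the option set below is made up for illustration):
//!
//!   std::vector<nvinfer1::utility::TRTOption> opts{
//!       {'b', "batch", true, "batch size"},
//!       {0, "verbose", false, "print verbose output"}};
//!   nvinfer1::utility::TRTParsedArgs parsed = nvinfer1::utility::getOptions(argc, argv, opts);
//!   if (!parsed.errMsg.empty())
//!   {
//!       // report parsed.errMsg and exit
//!   }
//!   int32_t const batchCount = parsed.values[0].first;            // occurrences of -b/--batch
//!   std::string const batchValue = batchCount ? parsed.values[0].second.back() : "1";
//!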
+TRTParsedArgs getOptions(int argc, const char* const* argv, const std::vector& options); +} // namespace utility +} // namespace nvinfer1 + +#endif // TRT_GET_OPTIONS_H diff --git a/src/Detector/tensorrt_yolo/common/getopt.c b/src/Detector/tensorrt_yolo/common/getopt.c new file mode 100644 index 00000000..c1da08b5 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/getopt.c @@ -0,0 +1,568 @@ +/* $OpenBSD: getopt_long.c,v 1.23 2007/10/31 12:34:57 chl Exp $ */ +/* $NetBSD: getopt_long.c,v 1.15 2002/01/31 22:43:40 tv Exp $ */ + +/* + * Copyright (c) 2002 Todd C. Miller + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Sponsored in part by the Defense Advanced Research Projects + * Agency (DARPA) and Air Force Research Laboratory, Air Force + * Materiel Command, USAF, under agreement number F39502-99-1-0512. + */ +/*- + * Copyright (c) 2000 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Dieter Baron and Thomas Klausner. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "getoptWin.h" +#include +#include +#include +#include +#include +#include + +#define REPLACE_GETOPT /* use this getopt as the system getopt(3) */ + +#ifdef REPLACE_GETOPT +int opterr = 1; /* if error message should be printed */ +int optind = 1; /* index into parent argv vector */ +int optopt = '?'; /* character checked for validity */ +#undef optreset /* see getopt.h */ +#define optreset __mingw_optreset +int optreset; /* reset getopt */ +char* optarg; /* argument associated with option */ +#endif + +#define PRINT_ERROR ((opterr) && (*options != ':')) + +#define FLAG_PERMUTE 0x01 /* permute non-options to the end of argv */ +#define FLAG_ALLARGS 0x02 /* treat non-options as args to option "-1" */ +#define FLAG_LONGONLY 0x04 /* operate as getopt_long_only */ + +/* return values */ +#define BADCH (int) '?' +#define BADARG ((*options == ':') ? (int) ':' : (int) '?') +#define INORDER (int) 1 + +#ifndef __CYGWIN__ +#define __progname __argv[0] +#else +extern char __declspec(dllimport) * __progname; +#endif + +#ifdef __CYGWIN__ +static char EMSG[] = ""; +#else +#define EMSG "" +#endif + +static int getopt_internal(int, char* const*, char const*, const struct option*, int*, int); +static int parse_long_options(char* const*, char const*, const struct option*, int*, int); +static int gcd(int, int); +static void permute_args(int, int, int, char* const*); + +static char* place = EMSG; /* option letter processing */ + +/* XXX: set optreset to 1 rather than these two */ +static int nonopt_start = -1; /* first non option argument (for permute) */ +static int nonopt_end = -1; /* first option after non options (for permute) */ + +/* Error messages */ +static char const recargchar[] = "option requires an argument -- %c"; +static char const recargstring[] = "option requires an argument -- %s"; +static char const ambig[] = "ambiguous option -- %.*s"; +static char const noarg[] = "option doesn't take an argument -- %.*s"; +static char const illoptchar[] = "unknown option -- %c"; +static char const illoptstring[] = "unknown option -- %s"; + +static void _vwarnx(char const* fmt, va_list ap) +{ + (void) fprintf(stderr, "%s: ", __progname); + if (fmt != NULL) + (void) vfprintf(stderr, fmt, ap); + (void) fprintf(stderr, "\n"); +} + +static void warnx(char const* fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + _vwarnx(fmt, ap); + va_end(ap); +} + +/* + * Compute the greatest common divisor of a and b. + */ +static int gcd(int a, int b) +{ + int c; + + c = a % b; + while (c != 0) + { + a = b; + b = c; + c = a % b; + } + + return (b); +} + +/* + * Exchange the block from nonopt_start to nonopt_end with the block + * from nonopt_end to opt_end (keeping the same order of arguments + * in each block). 
+ */ +static void permute_args(int panonopt_start, int panonopt_end, int opt_end, char* const* nargv) +{ + int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos; + char* swap; + + /* + * compute lengths of blocks and number and size of cycles + */ + nnonopts = panonopt_end - panonopt_start; + nopts = opt_end - panonopt_end; + ncycle = gcd(nnonopts, nopts); + cyclelen = (opt_end - panonopt_start) / ncycle; + + for (i = 0; i < ncycle; i++) + { + cstart = panonopt_end + i; + pos = cstart; + for (j = 0; j < cyclelen; j++) + { + if (pos >= panonopt_end) + pos -= nnonopts; + else + pos += nopts; + swap = nargv[pos]; + /* LINTED const cast */ + ((char**) nargv)[pos] = nargv[cstart]; + /* LINTED const cast */ + ((char**) nargv)[cstart] = swap; + } + } +} + +/* + * parse_long_options -- + * Parse long options in argc/argv argument vector. + * Returns -1 if short_too is set and the option does not match long_options. + */ +static int parse_long_options( + char* const* nargv, char const* options, const struct option* long_options, int* idx, int short_too) +{ + char *current_argv, *has_equal; + size_t current_argv_len; + int i, ambiguous, match; + +#define IDENTICAL_INTERPRETATION(_x, _y) \ + (long_options[(_x)].has_arg == long_options[(_y)].has_arg && long_options[(_x)].flag == long_options[(_y)].flag \ + && long_options[(_x)].val == long_options[(_y)].val) + + current_argv = place; + match = -1; + ambiguous = 0; + + optind++; + + if ((has_equal = strchr(current_argv, '=')) != NULL) + { + /* argument found (--option=arg) */ + current_argv_len = has_equal - current_argv; + has_equal++; + } + else + current_argv_len = strlen(current_argv); + + for (i = 0; long_options[i].name; i++) + { + /* find matching long option */ + if (strncmp(current_argv, long_options[i].name, current_argv_len)) + continue; + + if (strlen(long_options[i].name) == current_argv_len) + { + /* exact match */ + match = i; + ambiguous = 0; + break; + } + /* + * If this is a known short option, don't allow + * a partial match of a single character. + */ + if (short_too && current_argv_len == 1) + continue; + + if (match == -1) /* partial match */ + match = i; + else if (!IDENTICAL_INTERPRETATION(i, match)) + ambiguous = 1; + } + if (ambiguous) + { + /* ambiguous abbreviation */ + if (PRINT_ERROR) + warnx(ambig, (int) current_argv_len, current_argv); + optopt = 0; + return (BADCH); + } + if (match != -1) + { /* option found */ + if (long_options[match].has_arg == no_argument && has_equal) + { + if (PRINT_ERROR) + warnx(noarg, (int) current_argv_len, current_argv); + /* + * XXX: GNU sets optopt to val regardless of flag + */ + if (long_options[match].flag == NULL) + optopt = long_options[match].val; + else + optopt = 0; + return (BADARG); + } + if (long_options[match].has_arg == required_argument || long_options[match].has_arg == optional_argument) + { + if (has_equal) + optarg = has_equal; + else if (long_options[match].has_arg == required_argument) + { + /* + * optional argument doesn't use next nargv + */ + optarg = nargv[optind++]; + } + } + if ((long_options[match].has_arg == required_argument) && (optarg == NULL)) + { + /* + * Missing argument; leading ':' indicates no error + * should be generated. 
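+             *
+             * For example, a long option such as "--output" that requires a value but is the
+             * last argument lands here with optarg == NULL; if the short-option string begins
+             * with ':', the caller receives ':' instead of '?' and no diagnostic is printed.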
+ */ + if (PRINT_ERROR) + warnx(recargstring, current_argv); + /* + * XXX: GNU sets optopt to val regardless of flag + */ + if (long_options[match].flag == NULL) + optopt = long_options[match].val; + else + optopt = 0; + --optind; + return (BADARG); + } + } + else + { /* unknown option */ + if (short_too) + { + --optind; + return (-1); + } + if (PRINT_ERROR) + warnx(illoptstring, current_argv); + optopt = 0; + return (BADCH); + } + if (idx) + *idx = match; + if (long_options[match].flag) + { + *long_options[match].flag = long_options[match].val; + return (0); + } + else + return (long_options[match].val); +#undef IDENTICAL_INTERPRETATION +} + +/* + * getopt_internal -- + * Parse argc/argv argument vector. Called by user level routines. + */ +static int getopt_internal( + int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx, int flags) +{ + char const* oli; /* option letter list index */ + int optchar, short_too; + static int posixly_correct = -1; + + if (options == NULL) + return (-1); + + /* + * XXX Some GNU programs (like cvs) set optind to 0 instead of + * XXX using optreset. Work around this braindamage. + */ + if (optind == 0) + optind = optreset = 1; + + /* + * Disable GNU extensions if POSIXLY_CORRECT is set or options + * string begins with a '+'. + * + * CV, 2009-12-14: Check POSIXLY_CORRECT anew if optind == 0 or + * optreset != 0 for GNU compatibility. + */ + if (posixly_correct == -1 || optreset != 0) + posixly_correct = (getenv("POSIXLY_CORRECT") != NULL); + if (*options == '-') + flags |= FLAG_ALLARGS; + else if (posixly_correct || *options == '+') + flags &= ~FLAG_PERMUTE; + if (*options == '+' || *options == '-') + options++; + + optarg = NULL; + if (optreset) + nonopt_start = nonopt_end = -1; +start: + if (optreset || !*place) + { /* update scanning pointer */ + optreset = 0; + if (optind >= nargc) + { /* end of argument vector */ + place = EMSG; + if (nonopt_end != -1) + { + /* do permutation, if we have to */ + permute_args(nonopt_start, nonopt_end, optind, nargv); + optind -= nonopt_end - nonopt_start; + } + else if (nonopt_start != -1) + { + /* + * If we skipped non-options, set optind + * to the first of them. + */ + optind = nonopt_start; + } + nonopt_start = nonopt_end = -1; + return (-1); + } + if (*(place = nargv[optind]) != '-' || (place[1] == '\0' && strchr(options, '-') == NULL)) + { + place = EMSG; /* found non-option */ + if (flags & FLAG_ALLARGS) + { + /* + * GNU extension: + * return non-option as argument to option 1 + */ + optarg = nargv[optind++]; + return (INORDER); + } + if (!(flags & FLAG_PERMUTE)) + { + /* + * If no permutation wanted, stop parsing + * at first non-option. + */ + return (-1); + } + /* do permutation */ + if (nonopt_start == -1) + nonopt_start = optind; + else if (nonopt_end != -1) + { + permute_args(nonopt_start, nonopt_end, optind, nargv); + nonopt_start = optind - (nonopt_end - nonopt_start); + nonopt_end = -1; + } + optind++; + /* process next argument */ + goto start; + } + if (nonopt_start != -1 && nonopt_end == -1) + nonopt_end = optind; + + /* + * If we have "-" do nothing, if "--" we are done. + */ + if (place[1] != '\0' && *++place == '-' && place[1] == '\0') + { + optind++; + place = EMSG; + /* + * We found an option (--), so if we skipped + * non-options, we have to permute. 
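+             *
+             * For example, with "prog file1 -- -x" parsing stops at "--", and after the
+             * permutation both "file1" and "-x" are left for the caller as operands.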
+ */ + if (nonopt_end != -1) + { + permute_args(nonopt_start, nonopt_end, optind, nargv); + optind -= nonopt_end - nonopt_start; + } + nonopt_start = nonopt_end = -1; + return (-1); + } + } + + /* + * Check long options if: + * 1) we were passed some + * 2) the arg is not just "-" + * 3) either the arg starts with -- we are getopt_long_only() + */ + if (long_options != NULL && place != nargv[optind] && (*place == '-' || (flags & FLAG_LONGONLY))) + { + short_too = 0; + if (*place == '-') + place++; /* --foo long option */ + else if (*place != ':' && strchr(options, *place) != NULL) + short_too = 1; /* could be short option too */ + + optchar = parse_long_options(nargv, options, long_options, idx, short_too); + if (optchar != -1) + { + place = EMSG; + return (optchar); + } + } + + if ((optchar = (int) *place++) == (int) ':' || (optchar == (int) '-' && *place != '\0') + || (oli = strchr(options, optchar)) == NULL) + { + /* + * If the user specified "-" and '-' isn't listed in + * options, return -1 (non-option) as per POSIX. + * Otherwise, it is an unknown option character (or ':'). + */ + if (optchar == (int) '-' && *place == '\0') + return (-1); + if (!*place) + ++optind; + if (PRINT_ERROR) + warnx(illoptchar, optchar); + optopt = optchar; + return (BADCH); + } + if (long_options != NULL && optchar == 'W' && oli[1] == ';') + { + /* -W long-option */ + if (*place) /* no space */ + /* NOTHING */; + else if (++optind >= nargc) + { /* no arg */ + place = EMSG; + if (PRINT_ERROR) + warnx(recargchar, optchar); + optopt = optchar; + return (BADARG); + } + else /* white space */ + place = nargv[optind]; + optchar = parse_long_options(nargv, options, long_options, idx, 0); + place = EMSG; + return (optchar); + } + if (*++oli != ':') + { /* doesn't take argument */ + if (!*place) + ++optind; + } + else + { /* takes (optional) argument */ + optarg = NULL; + if (*place) /* no white space */ + optarg = place; + else if (oli[1] != ':') + { /* arg not optional */ + if (++optind >= nargc) + { /* no arg */ + place = EMSG; + if (PRINT_ERROR) + warnx(recargchar, optchar); + optopt = optchar; + return (BADARG); + } + else + optarg = nargv[optind]; + } + place = EMSG; + ++optind; + } + /* dump back option letter */ + return (optchar); +} + +#ifdef REPLACE_GETOPT +/* + * getopt -- + * Parse argc/argv argument vector. + * + * [eventually this will replace the BSD getopt] + */ +int getopt(int nargc, char* const* nargv, char const* options) +{ + + /* + * We don't pass FLAG_PERMUTE to getopt_internal() since + * the BSD getopt(3) (unlike GNU) has never done this. + * + * Furthermore, since many privileged programs call getopt() + * before dropping privileges it makes sense to keep things + * as simple (and bug-free) as possible. + */ + return (getopt_internal(nargc, nargv, options, NULL, NULL, 0)); +} +#endif /* REPLACE_GETOPT */ + +/* + * getopt_long -- + * Parse argc/argv argument vector. + */ +int getopt_long(int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx) +{ + + return (getopt_internal(nargc, nargv, options, long_options, idx, FLAG_PERMUTE)); +} + +/* + * getopt_long_only -- + * Parse argc/argv argument vector. 
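+ * Behaves like getopt_long(), but also matches long options introduced by a single '-'
+ * (FLAG_LONGONLY); if the word matches no long option and its first character is a known
+ * short option, it is parsed as short options instead.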
+ */ +int getopt_long_only(int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx) +{ + + return (getopt_internal(nargc, nargv, options, long_options, idx, FLAG_PERMUTE | FLAG_LONGONLY)); +} diff --git a/src/Detector/tensorrt_yolo/common/getoptWin.h b/src/Detector/tensorrt_yolo/common/getoptWin.h new file mode 100644 index 00000000..a1dc6ffa --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/getoptWin.h @@ -0,0 +1,124 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __GETOPT_H__ +/** + * DISCLAIMER + * This file has no copyright assigned and is placed in the Public Domain. + * This file is a part of the w64 mingw-runtime package. + * + * The w64 mingw-runtime package and its code is distributed in the hope that it + * will be useful but WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESSED OR + * IMPLIED ARE HEREBY DISCLAIMED. This includes but is not limited to + * warranties of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#define __GETOPT_H__ + +/* All the headers include this file. */ +#include + +#if defined(WINGETOPT_SHARED_LIB) +#if defined(BUILDING_WINGETOPT_DLL) +#define WINGETOPT_API __declspec(dllexport) +#else +#define WINGETOPT_API __declspec(dllimport) +#endif +#else +#define WINGETOPT_API +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif + + WINGETOPT_API extern int optind; /* index of first non-option in argv */ + WINGETOPT_API extern int optopt; /* single option character, as parsed */ + WINGETOPT_API extern int opterr; /* flag to enable built-in diagnostics... */ + /* (user may set to zero, to suppress) */ + + WINGETOPT_API extern char* optarg; /* pointer to argument of current option */ + + extern int getopt(int nargc, char* const* nargv, char const* options); + +#ifdef _BSD_SOURCE +/* + * BSD adds the non-standard `optreset' feature, for reinitialisation + * of `getopt' parsing. We support this feature, for applications which + * proclaim their BSD heritage, before including this header; however, + * to maintain portability, developers are advised to avoid it. + */ +#define optreset __mingw_optreset + extern int optreset; +#endif +#ifdef __cplusplus +} +#endif +/* + * POSIX requires the `getopt' API to be specified in `unistd.h'; + * thus, `unistd.h' includes this header. However, we do not want + * to expose the `getopt_long' or `getopt_long_only' APIs, when + * included in this manner. Thus, close the standard __GETOPT_H__ + * declarations block, and open an additional __GETOPT_LONG_H__ + * specific block, only when *not* __UNISTD_H_SOURCED__, in which + * to declare the extended API. + */ +#endif /* !defined(__GETOPT_H__) */ + +#if !defined(__UNISTD_H_SOURCED__) && !defined(__GETOPT_LONG_H__) +#define __GETOPT_LONG_H__ + +#ifdef __cplusplus +extern "C" +{ +#endif + + struct option /* specification for a long form option... 
*/ + { + char const* name; /* option name, without leading hyphens */ + int has_arg; /* does it take an argument? */ + int* flag; /* where to save its status, or NULL */ + int val; /* its associated status value */ + }; + + enum /* permitted values for its `has_arg' field... */ + { + no_argument = 0, /* option never takes an argument */ + required_argument, /* option always requires an argument */ + optional_argument /* option may take an argument */ + }; + + extern int getopt_long( + int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx); + extern int getopt_long_only( + int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx); +/* + * Previous MinGW implementation had... + */ +#ifndef HAVE_DECL_GETOPT +/* + * ...for the long form API only; keep this for compatibility. + */ +#define HAVE_DECL_GETOPT 1 +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* !defined(__UNISTD_H_SOURCED__) && !defined(__GETOPT_LONG_H__) */ diff --git a/src/Detector/tensorrt_yolo/common/half.h b/src/Detector/tensorrt_yolo/common/half.h index 0755c316..b997e7db 100644 --- a/src/Detector/tensorrt_yolo/common/half.h +++ b/src/Detector/tensorrt_yolo/common/half.h @@ -16,13 +16,14 @@ // OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. /* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -1522,14 +1523,14 @@ class half /// \return incremented half value half& operator++() { - return *this += 1.0f; + return *this += 1.0F; } /// Prefix decrement. /// \return decremented half value half& operator--() { - return *this -= 1.0f; + return *this -= 1.0F; } /// Postfix increment. diff --git a/src/Detector/tensorrt_yolo/common/logger.cpp b/src/Detector/tensorrt_yolo/common/logger.cpp index 03c64398..909ec0bb 100644 --- a/src/Detector/tensorrt_yolo/common/logger.cpp +++ b/src/Detector/tensorrt_yolo/common/logger.cpp @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -17,7 +18,7 @@ #include "logger.h" #include "ErrorRecorder.h" #include "logging.h" - +using namespace nvinfer1; SampleErrorRecorder gRecorder; namespace sample { diff --git a/src/Detector/tensorrt_yolo/common/logger.h b/src/Detector/tensorrt_yolo/common/logger.h index 3069e8e9..8205e457 100644 --- a/src/Detector/tensorrt_yolo/common/logger.h +++ b/src/Detector/tensorrt_yolo/common/logger.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/src/Detector/tensorrt_yolo/common/logging.h b/src/Detector/tensorrt_yolo/common/logging.h index 78732c10..69273a5e 100644 --- a/src/Detector/tensorrt_yolo/common/logging.h +++ b/src/Detector/tensorrt_yolo/common/logging.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -17,7 +18,7 @@ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H -#include "NvInferRuntimeCommon.h" +#include "NvInferRuntime.h" #include "sampleOptions.h" #include #include @@ -162,7 +163,7 @@ class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream } LogStreamConsumer(const LogStreamConsumer& other) = delete; LogStreamConsumer() = delete; - ~LogStreamConsumer() = default; + ~LogStreamConsumer() override = default; LogStreamConsumer& operator=(const LogStreamConsumer&) = delete; LogStreamConsumer& operator=(LogStreamConsumer&&) = delete; @@ -291,7 +292,7 @@ class Logger : public nvinfer1::ILogger }; //! - //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger + //! \brief Forward-compatible method for retrieving the nvinfer1::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, @@ -353,7 +354,7 @@ class Logger : public nvinfer1::ILogger //! //! \brief Define a test for logging //! - //! \param[in] name The name of the test. This should be a string starting with + //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! 
the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" @@ -379,7 +380,8 @@ class Logger : public nvinfer1::ILogger static TestAtom defineTest(const std::string& name, int32_t argc, char const* const* argv) { // Append TensorRT version as info - const std::string vname = name + " [TensorRT v" + std::to_string(NV_TENSORRT_VERSION) + "]"; + const std::string vname = name + " [TensorRT v" + std::to_string(NV_TENSORRT_VERSION) + "] [b" + + std::to_string(NV_TENSORRT_BUILD) + "]"; auto cmdline = genCmdlineString(argc, argv); return defineTest(vname, cmdline); } diff --git a/src/Detector/tensorrt_yolo/common/parserOnnxConfig.h b/src/Detector/tensorrt_yolo/common/parserOnnxConfig.h index c92a1420..67ee6c71 100644 --- a/src/Detector/tensorrt_yolo/common/parserOnnxConfig.h +++ b/src/Detector/tensorrt_yolo/common/parserOnnxConfig.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -35,15 +36,13 @@ * */ -using namespace std; - class ParserOnnxConfig : public nvonnxparser::IOnnxConfig { protected: - string mModelFilename{}; - string mTextFilename{}; - string mFullTextFilename{}; + std::string mModelFilename{}; + std::string mTextFilename{}; + std::string mFullTextFilename{}; nvinfer1::DataType mModelDtype; nvonnxparser::IOnnxConfig::Verbosity mVerbosity; bool mPrintLayercInfo; @@ -62,8 +61,7 @@ class ParserOnnxConfig : public nvonnxparser::IOnnxConfig #endif } -protected: - ~ParserOnnxConfig() + ~ParserOnnxConfig() override { #ifdef ONNX_DEBUG if (isDebug()) @@ -74,62 +72,62 @@ class ParserOnnxConfig : public nvonnxparser::IOnnxConfig } public: - virtual void setModelDtype(const nvinfer1::DataType modelDtype) noexcept + void setModelDtype(const nvinfer1::DataType modelDtype) noexcept override { mModelDtype = modelDtype; } - virtual nvinfer1::DataType getModelDtype() const noexcept + nvinfer1::DataType getModelDtype() const noexcept override { return mModelDtype; } - virtual const char* getModelFileName() const noexcept + const char* getModelFileName() const noexcept override { return mModelFilename.c_str(); } - virtual void setModelFileName(const char* onnxFilename) noexcept + void setModelFileName(const char* onnxFilename) noexcept override { - mModelFilename = string(onnxFilename); + mModelFilename = std::string(onnxFilename); } - virtual nvonnxparser::IOnnxConfig::Verbosity getVerbosityLevel() const noexcept + nvonnxparser::IOnnxConfig::Verbosity getVerbosityLevel() const noexcept override { return mVerbosity; } - virtual void addVerbosity() noexcept + void addVerbosity() noexcept override { ++mVerbosity; } - virtual void reduceVerbosity() noexcept + void reduceVerbosity() noexcept override { --mVerbosity; } - virtual void setVerbosityLevel(nvonnxparser::IOnnxConfig::Verbosity verbosity) noexcept + void setVerbosityLevel(nvonnxparser::IOnnxConfig::Verbosity verbosity) noexcept override { mVerbosity = verbosity; } - virtual const char* getTextFileName() const noexcept + const char* 
getTextFileName() const noexcept override { return mTextFilename.c_str(); } - virtual void setTextFileName(const char* textFilename) noexcept + void setTextFileName(const char* textFilename) noexcept override { - mTextFilename = string(textFilename); + mTextFilename = std::string(textFilename); } - virtual const char* getFullTextFileName() const noexcept + const char* getFullTextFileName() const noexcept override { return mFullTextFilename.c_str(); } - virtual void setFullTextFileName(const char* fullTextFilename) noexcept + void setFullTextFileName(const char* fullTextFilename) noexcept override { - mFullTextFilename = string(fullTextFilename); + mFullTextFilename = std::string(fullTextFilename); } - virtual bool getPrintLayerInfo() const noexcept + bool getPrintLayerInfo() const noexcept override { return mPrintLayercInfo; } - virtual void setPrintLayerInfo(bool src) noexcept + void setPrintLayerInfo(bool src) noexcept override { mPrintLayercInfo = src; } //!< get the boolean variable corresponding to the Layer Info, see getPrintLayerInfo() @@ -142,12 +140,6 @@ class ParserOnnxConfig : public nvonnxparser::IOnnxConfig return false; #endif } - - virtual void destroy() noexcept - { - delete this; - } - }; // class ParserOnnxConfig #endif diff --git a/src/Detector/tensorrt_yolo/common/safeCommon.h b/src/Detector/tensorrt_yolo/common/safeCommon.h index 3d84b095..f10aad18 100644 --- a/src/Detector/tensorrt_yolo/common/safeCommon.h +++ b/src/Detector/tensorrt_yolo/common/safeCommon.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -17,13 +18,32 @@ #ifndef TENSORRT_SAFE_COMMON_H #define TENSORRT_SAFE_COMMON_H -#include "NvInferRuntimeCommon.h" +#include "cuda_runtime.h" +#include "sampleEntrypoints.h" +#include #include +#include #include #include +#include #include #include +// For safeLoadLibrary +#ifdef _MSC_VER +// Needed so that the max/min definitions in windows.h do not conflict with std::max/min. +#define NOMINMAX +#include +#undef NOMINMAX +#else +#include +#endif +#if IS_QNX_SAFE +#include +#include +#endif // IS_QNX_SAFE + +#undef CHECK #define CHECK(status) \ do \ { \ @@ -31,10 +51,92 @@ if (ret != 0) \ { \ std::cerr << "Cuda failure: " << ret << std::endl; \ - abort(); \ + exit(EXIT_FAILURE); \ } \ } while (0) +#undef SAFE_ASSERT +#define SAFE_ASSERT(condition) \ + do \ + { \ + if (!(condition)) \ + { \ + std::cerr << "Assertion failure: " << #condition << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +//! Locate path to file, given its filename or filepath suffix and possible dirs it might lie in. +//! Function will also walk back MAX_DEPTH dirs from CWD to check for such a file path. 
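+//! Example (hypothetical arguments): locateFile("coco.names", {"data/", "../data/"}) checks
+//! "data/coco.names" and "../data/coco.names", prepending "../" up to MAX_DEPTH (10) times for
+//! each candidate; on failure it prints a diagnostic and, with reportError left true, exits.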
+inline std::string locateFile( + const std::string& filepathSuffix, const std::vector& directories, bool reportError = true) +{ + const int MAX_DEPTH{10}; + bool found{false}; + std::string filepath; + + for (auto& dir : directories) + { + if (!dir.empty() && dir.back() != '/') + { +#ifdef _MSC_VER + filepath = dir + "\\" + filepathSuffix; +#else + filepath = dir + "/" + filepathSuffix; +#endif + } + else + { + filepath = dir + filepathSuffix; + } + + for (int i = 0; i < MAX_DEPTH && !found; i++) + { + const std::ifstream checkFile(filepath); + found = checkFile.is_open(); + if (found) + { + break; + } + + filepath = "../" + filepath; // Try again in parent dir + } + + if (found) + { + break; + } + + filepath.clear(); + } + + // Could not find the file + if (filepath.empty()) + { + const std::string dirList = std::accumulate(directories.begin() + 1, directories.end(), directories.front(), + [](const std::string& a, const std::string& b) { return a + "\n\t" + b; }); + std::cout << "Could not find " << filepathSuffix << " in data directories:\n\t" << dirList << std::endl; + + if (reportError) + { + std::cout << "&&&& FAILED" << std::endl; + exit(EXIT_FAILURE); + } + } + + return filepath; +} + +inline void readPGMFile(const std::string& fileName, uint8_t* buffer, int32_t inH, int32_t inW) +{ + std::ifstream infile(fileName, std::ifstream::binary); + SAFE_ASSERT(infile.is_open() && "Attempting to read from a file that is not open."); + std::string magic, w, h, max; + infile >> magic >> w >> h >> max; + infile.seekg(1, infile.cur); + infile.read(reinterpret_cast(buffer), inH * inW); +} + namespace samplesCommon { template @@ -51,11 +153,17 @@ inline uint32_t elementSize(nvinfer1::DataType t) { switch (t) { + case nvinfer1::DataType::kINT64: return 8; case nvinfer1::DataType::kINT32: case nvinfer1::DataType::kFLOAT: return 4; - case nvinfer1::DataType::kHALF: return 2; - case nvinfer1::DataType::kINT8: return 1; - case nvinfer1::DataType::kBOOL: return 1; + case nvinfer1::DataType::kHALF: + case nvinfer1::DataType::kBF16: return 2; + case nvinfer1::DataType::kINT8: + case nvinfer1::DataType::kUINT8: + case nvinfer1::DataType::kBOOL: + case nvinfer1::DataType::kFP8: return 1; + case nvinfer1::DataType::kINT4: + SAFE_ASSERT(false && "Element size is not implemented for sub-byte data-types"); } return 0; } @@ -66,6 +174,205 @@ inline A divUp(A x, B n) return (x + n - 1) / n; } +inline int64_t volume(nvinfer1::Dims const& d) +{ + return std::accumulate(d.d, d.d + d.nbDims, int64_t{1}, std::multiplies{}); +} + +//! Return m rounded up to nearest multiple of n +template +inline T1 roundUp(T1 m, T2 n) +{ + static_assert(std::is_integral::value && std::is_integral::value, "arguments must be integers"); + static_assert(std::is_signed::value == std::is_signed::value, "mixed signedness not allowed"); + static_assert(sizeof(T1) >= sizeof(T2), "first type must be as least as wide as second type"); + return ((m + n - 1) / n) * n; +} + +//! comps is the number of components in a vector. Ignored if vecDim < 0. 
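+//! Example: volume({4, {1, 3, 224, 224}}, 1, 4, 1) rounds the channel dimension up to the
+//! vector width (3 -> 4) and returns 1 * 4 * 224 * 224 = 200704 elements.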
+inline int64_t volume(nvinfer1::Dims dims, int32_t vecDim, int32_t comps, int32_t batch) +{ + if (vecDim >= 0) + { + dims.d[vecDim] = roundUp(dims.d[vecDim], comps); + } + return samplesCommon::volume(dims) * std::max(batch, 1); +} + +inline int32_t getSMVersion() +{ +#if 0 + // Use default value for 4090 + int32_t major{8}; + int32_t minor{9}; +#else + int32_t major{}; + int32_t minor{}; + int32_t deviceIndex{}; + CHECK(cudaGetDevice(&deviceIndex)); + CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, deviceIndex)); + CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, deviceIndex)); +#endif + return ((major << 8) | minor); +} + +inline bool isSMSafe() +{ + const int32_t smVersion = getSMVersion(); + return smVersion == 0x0700 || smVersion == 0x0705 || smVersion == 0x0800 || smVersion == 0x0806 + || smVersion == 0x0807; +} + +inline int32_t calculateSoftmax(float* const prob, int32_t const numDigits) +{ + SAFE_ASSERT(prob != nullptr); + SAFE_ASSERT(numDigits == 10); + float sum{0.0F}; + std::transform(prob, prob + numDigits, prob, [&sum](float v) -> float { + sum += exp(v); + return exp(v); + }); + + SAFE_ASSERT(sum != 0.0F); + std::transform(prob, prob + numDigits, prob, [sum](float v) -> float { return v / sum; }); + int32_t idx = std::max_element(prob, prob + numDigits) - prob; + return idx; +} + +//! +//! \class TrtCudaGraphSafe +//! \brief Managed CUDA graph +//! +class TrtCudaGraphSafe +{ +public: + explicit TrtCudaGraphSafe() = default; + + TrtCudaGraphSafe(const TrtCudaGraphSafe&) = delete; + + TrtCudaGraphSafe& operator=(const TrtCudaGraphSafe&) = delete; + + TrtCudaGraphSafe(TrtCudaGraphSafe&&) = delete; + + TrtCudaGraphSafe& operator=(TrtCudaGraphSafe&&) = delete; + + ~TrtCudaGraphSafe() + { + if (mGraphExec) + { + cudaGraphExecDestroy(mGraphExec); + } + } + + void beginCapture(cudaStream_t& stream) + { + // cudaStreamCaptureModeGlobal is the only allowed mode in SAFE CUDA + CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); + } + + bool launch(cudaStream_t& stream) + { + return cudaGraphLaunch(mGraphExec, stream) == cudaSuccess; + } + + void endCapture(cudaStream_t& stream) + { + CHECK(cudaStreamEndCapture(stream, &mGraph)); + CHECK(cudaGraphInstantiate(&mGraphExec, mGraph, nullptr, nullptr, 0)); + CHECK(cudaGraphDestroy(mGraph)); + } + + void endCaptureOnError(cudaStream_t& stream) + { + // There are two possibilities why stream capture would fail: + // (1) stream is in cudaErrorStreamCaptureInvalidated state. + // (2) TRT reports a failure. + // In case (1), the returning mGraph should be nullptr. + // In case (2), the returning mGraph is not nullptr, but it should not be used. + const auto ret = cudaStreamEndCapture(stream, &mGraph); + if (ret == cudaErrorStreamCaptureInvalidated) + { + SAFE_ASSERT(mGraph == nullptr); + } + else + { + SAFE_ASSERT(ret == cudaSuccess); + SAFE_ASSERT(mGraph != nullptr); + CHECK(cudaGraphDestroy(mGraph)); + mGraph = nullptr; + } + // Clean up any CUDA error. + cudaGetLastError(); + sample::gLogError << "The CUDA graph capture on the stream has failed." 
<< std::endl; + } + +private: + cudaGraph_t mGraph{}; + cudaGraphExec_t mGraphExec{}; +}; + +inline void safeLoadLibrary(const std::string& path) +{ +#ifdef _MSC_VER + void* handle = LoadLibraryA(path.c_str()); +#else + int32_t flags{RTLD_LAZY}; + void* handle = dlopen(path.c_str(), flags); +#endif + if (handle == nullptr) + { +#ifdef _MSC_VER + sample::gLogError << "Could not load plugin library: " << path << std::endl; +#else + sample::gLogError << "Could not load plugin library: " << path << ", due to: " << dlerror() << std::endl; +#endif + } +} + +inline std::vector safeSplitString(std::string str, char delimiter = ',') +{ + std::vector splitVect; + std::stringstream ss(str); + std::string substr; + + while (ss.good()) + { + getline(ss, substr, delimiter); + splitVect.emplace_back(std::move(substr)); + } + return splitVect; +} + } // namespace samplesCommon +namespace safetyCompliance +{ +inline void initSafeCuda() +{ + // According to CUDA initialization in NVIDIA CUDA SAFETY API REFERENCE FOR DRIVE OS + // We will need to do the following in order + // 1. Initialize the calling thread with CUDA specific information (Call any CUDA RT API identified as init) + // 2. Query/Configure and choose the desired CUDA device + // 3. CUDA context initialization. (Call cudaDeviceGetLimit or cuCtxCreate) + size_t stackSizeLimit = 0; + int32_t deviceIndex = 0; + CHECK(cudaGetDevice(&deviceIndex)); + CHECK(cudaDeviceGetLimit(&stackSizeLimit, cudaLimitStackSize)); +#if IS_QNX_SAFE + CHECK(cudaSafeExSelectAPIMode(cudaSafeExAPIModeAsilB)); +#endif // IS_QNX_SAFE +} + +inline void setPromgrAbility() +{ +#if IS_QNX_SAFE + // Comply with DEEPLRN_RES_117 on QNX-safe by dropping PROCMGR_AID_MEM_PHYS ability and locking out any further + // changes + procmgr_ability( + 0, PROCMGR_ADN_NONROOT | PROCMGR_AOP_DENY | PROCMGR_AOP_LOCK | PROCMGR_AID_MEM_PHYS, PROCMGR_AID_EOL); +#endif // IS_QNX_SAFE +} + +} // namespace safetyCompliance + #endif // TENSORRT_SAFE_COMMON_H diff --git a/src/Detector/tensorrt_yolo/common/sampleConfig.h b/src/Detector/tensorrt_yolo/common/sampleConfig.h index 53a78331..801a268a 100644 --- a/src/Detector/tensorrt_yolo/common/sampleConfig.h +++ b/src/Detector/tensorrt_yolo/common/sampleConfig.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -55,9 +56,9 @@ class SampleConfig : public nvonnxparser::IOnnxConfig bool mDebugBuilder{false}; InputDataFormat mInputDataFormat{InputDataFormat::kASCII}; uint64_t mTopK{0}; - float mFailurePercentage{-1.0f}; - float mTolerance{0.0f}; - float mAbsTolerance{1e-5f}; + float mFailurePercentage{-1.0F}; + float mTolerance{0.0F}; + float mAbsTolerance{1e-5F}; public: SampleConfig() @@ -70,8 +71,7 @@ class SampleConfig : public nvonnxparser::IOnnxConfig #endif } -protected: - ~SampleConfig() + ~SampleConfig() override { #ifdef ONNX_DEBUG if (isDebug()) @@ -82,12 +82,12 @@ class SampleConfig : public nvonnxparser::IOnnxConfig } public: - void setModelDtype(const nvinfer1::DataType mdt) noexcept + void setModelDtype(const nvinfer1::DataType mdt) noexcept override { mModelDtype = mdt; } - nvinfer1::DataType getModelDtype() const noexcept + nvinfer1::DataType getModelDtype() const noexcept override { return mModelDtype; } @@ -102,28 +102,28 @@ class SampleConfig : public nvonnxparser::IOnnxConfig mTF32 = enabled; } - const char* getModelFileName() const noexcept + const char* getModelFileName() const noexcept override { return mModelFilename.c_str(); } - void setModelFileName(const char* onnxFilename) noexcept + void setModelFileName(const char* onnxFilename) noexcept override { mModelFilename = std::string(onnxFilename); } - Verbosity getVerbosityLevel() const noexcept + Verbosity getVerbosityLevel() const noexcept override { return mVerbosity; } - void addVerbosity() noexcept + void addVerbosity() noexcept override { ++mVerbosity; } - void reduceVerbosity() noexcept + void reduceVerbosity() noexcept override { --mVerbosity; } - virtual void setVerbosityLevel(Verbosity v) noexcept + void setVerbosityLevel(Verbosity v) noexcept override { mVerbosity = v; } @@ -135,19 +135,19 @@ class SampleConfig : public nvonnxparser::IOnnxConfig { mEngineFilename = std::string(engineFilename); } - const char* getTextFileName() const noexcept + const char* getTextFileName() const noexcept override { return mTextFilename.c_str(); } - void setTextFileName(const char* textFilename) noexcept + void setTextFileName(const char* textFilename) noexcept override { mTextFilename = std::string(textFilename); } - const char* getFullTextFileName() const noexcept + const char* getFullTextFileName() const noexcept override { return mFullTextFilename.c_str(); } - void setFullTextFileName(const char* fullTextFilename) noexcept + void setFullTextFileName(const char* fullTextFilename) noexcept override { mFullTextFilename = std::string(fullTextFilename); } @@ -161,12 +161,12 @@ class SampleConfig : public nvonnxparser::IOnnxConfig return mLabel; } //!< get the Label - bool getPrintLayerInfo() const noexcept + bool getPrintLayerInfo() const noexcept override { return mPrintLayercInfo; } - void setPrintLayerInfo(bool b) noexcept + void setPrintLayerInfo(bool b) noexcept override { mPrintLayercInfo = b; } //!< get the boolean variable corresponding to the Layer Info, see getPrintLayerInfo() @@ -312,7 +312,7 @@ class SampleConfig : public nvonnxparser::IOnnxConfig { return mTimingCacheFilename.c_str(); } - + void setTimingCacheFileName(const char* timingCacheFilename) noexcept { mTimingCacheFilename = std::string(timingCacheFilename); @@ -326,12 +326,6 @@ class 
SampleConfig : public nvonnxparser::IOnnxConfig return false; #endif } - - void destroy() noexcept - { - delete this; - } - }; // class SampleConfig #endif diff --git a/src/Detector/tensorrt_yolo/common/sampleDevice.cpp b/src/Detector/tensorrt_yolo/common/sampleDevice.cpp new file mode 100644 index 00000000..7964aeb5 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/sampleDevice.cpp @@ -0,0 +1,133 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sampleDevice.h" + +#include + +namespace sample +{ + +void cudaCheck(cudaError_t ret, std::ostream& err) +{ + if (ret != cudaSuccess) + { + err << "Cuda failure: " << cudaGetErrorString(ret) << std::endl; + exit(EXIT_FAILURE); + } +} + +// Construct GPU UUID string in the same format as nvidia-smi does. +std::string getUuidString(cudaUUID_t uuid) +{ + constexpr int32_t kUUID_SIZE = sizeof(cudaUUID_t); + static_assert(kUUID_SIZE == 16, "Unexpected size for cudaUUID_t!"); + + std::ostringstream ss; + std::vector const splits = {0, 4, 6, 8, 10, kUUID_SIZE}; + + ss << "GPU" << std::hex << std::setfill('0'); + for (int32_t splitIdx = 0; splitIdx < static_cast(splits.size()) - 1; ++splitIdx) + { + ss << "-"; + for (int32_t byteIdx = splits[splitIdx]; byteIdx < splits[splitIdx + 1]; ++byteIdx) + { + ss << std::setw(2) << +static_cast(uuid.bytes[byteIdx]); + } + } + return ss.str(); +} + +void setCudaDevice(int32_t device, std::ostream& os) +{ +#if !TRT_WINML + os << "=== Device Information ===" << std::endl; + + // Get the number of visible GPUs. + int32_t nbDevices{-1}; + cudaCheck(cudaGetDeviceCount(&nbDevices)); + + if (nbDevices <= 0) + { + os << "Cannot find any available devices (GPUs)!" << std::endl; + exit(EXIT_FAILURE); + } + + // Print out the GPU name and PCIe bus ID of each GPU. + os << "Available Devices: " << std::endl; + cudaDeviceProp properties; + for (int32_t deviceIdx = 0; deviceIdx < nbDevices; ++deviceIdx) + { + cudaDeviceProp tempProperties; + cudaCheck(cudaGetDeviceProperties(&tempProperties, deviceIdx)); + + // clang-format off + os << " Device " << deviceIdx << ": \"" << tempProperties.name << "\" UUID: " + << getUuidString(tempProperties.uuid) << std::endl; + // clang-format on + + // Record the properties of the desired GPU. + if (deviceIdx == device) + { + properties = tempProperties; + } + } + + // Exit with error if the requested device ID does not exist. + if (device < 0 || device >= nbDevices) + { + os << "Cannot find device ID " << device << "!" << std::endl; + exit(EXIT_FAILURE); + } + + // Set to the corresponding GPU. + cudaCheck(cudaSetDevice(device)); + + // clang-format off + os << "Selected Device: " << properties.name << std::endl; + os << "Selected Device ID: " << device << std::endl; + os << "Selected Device UUID: " << getUuidString(properties.uuid) << std::endl; + os << "Compute Capability: " << properties.major << "." 
<< properties.minor << std::endl; + os << "SMs: " << properties.multiProcessorCount << std::endl; + os << "Device Global Memory: " << (properties.totalGlobalMem >> 20) << " MiB" << std::endl; + os << "Shared Memory per SM: " << (properties.sharedMemPerMultiprocessor >> 10) << " KiB" << std::endl; + os << "Memory Bus Width: " << properties.memoryBusWidth << " bits" + << " (ECC " << (properties.ECCEnabled != 0 ? "enabled" : "disabled") << ")" << std::endl; + os << "Application Compute Clock Rate: " << properties.clockRate / 1000000.0F << " GHz" << std::endl; + os << "Application Memory Clock Rate: " << properties.memoryClockRate / 1000000.0F << " GHz" << std::endl; + os << std::endl; + os << "Note: The application clock rates do not reflect the actual clock rates that the GPU is " + << "currently running at." << std::endl; + // clang-format on +#endif +} + +int32_t getCudaDriverVersion() +{ + int32_t version{-1}; + cudaCheck(cudaDriverGetVersion(&version)); + return version; +} + +int32_t getCudaRuntimeVersion() +{ + int32_t version{-1}; + cudaCheck(cudaRuntimeGetVersion(&version)); + return version; +} + +} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleDevice.h b/src/Detector/tensorrt_yolo/common/sampleDevice.h index 2053ac7c..986dccb4 100644 --- a/src/Detector/tensorrt_yolo/common/sampleDevice.h +++ b/src/Detector/tensorrt_yolo/common/sampleDevice.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -23,17 +24,13 @@ #include #include +#include "sampleUtils.h" + namespace sample { -inline void cudaCheck(cudaError_t ret, std::ostream& err = std::cerr) -{ - if (ret != cudaSuccess) - { - err << "Cuda failure: " << cudaGetErrorString(ret) << std::endl; - abort(); - } -} +//! Check if the CUDA return status shows any error. If so, exit the program immediately. +void cudaCheck(cudaError_t ret, std::ostream& err = std::cerr); class TrtCudaEvent; @@ -238,16 +235,18 @@ class TrtCudaBuffer TrtCudaBuffer(TrtCudaBuffer&& rhs) { - reset(rhs.mPtr); + reset(rhs.mPtr, rhs.mSize); rhs.mPtr = nullptr; + rhs.mSize = 0; } TrtCudaBuffer& operator=(TrtCudaBuffer&& rhs) { if (this != &rhs) { - reset(rhs.mPtr); + reset(rhs.mPtr, rhs.mSize); rhs.mPtr = nullptr; + rhs.mSize = 0; } return *this; } @@ -260,21 +259,24 @@ class TrtCudaBuffer TrtCudaBuffer(size_t size) { A()(&mPtr, size); + mSize = size; } void allocate(size_t size) { reset(); A()(&mPtr, size); + mSize = size; } - void reset(void* ptr = nullptr) + void reset(void* ptr = nullptr, size_t size = 0) { if (mPtr) { D()(mPtr); } mPtr = ptr; + mSize = size; } void* get() const @@ -282,8 +284,14 @@ class TrtCudaBuffer return mPtr; } + size_t getSize() const + { + return mSize; + } + private: void* mPtr{nullptr}; + size_t mSize{0}; }; struct DeviceAllocator @@ -383,39 +391,39 @@ class IMirroredBuffer }; // class IMirroredBuffer //! -//! Class to have a seperate memory buffer for discrete device and host allocations. +//! 
Class to have a separate memory buffer for discrete device and host allocations. //! class DiscreteMirroredBuffer : public IMirroredBuffer { public: - void allocate(size_t size) + void allocate(size_t size) override { mSize = size; mHostBuffer.allocate(size); mDeviceBuffer.allocate(size); } - void* getDeviceBuffer() const + void* getDeviceBuffer() const override { return mDeviceBuffer.get(); } - void* getHostBuffer() const + void* getHostBuffer() const override { return mHostBuffer.get(); } - void hostToDevice(TrtCudaStream& stream) + void hostToDevice(TrtCudaStream& stream) override { cudaCheck(cudaMemcpyAsync(mDeviceBuffer.get(), mHostBuffer.get(), mSize, cudaMemcpyHostToDevice, stream.get())); } - void deviceToHost(TrtCudaStream& stream) + void deviceToHost(TrtCudaStream& stream) override { cudaCheck(cudaMemcpyAsync(mHostBuffer.get(), mDeviceBuffer.get(), mSize, cudaMemcpyDeviceToHost, stream.get())); } - size_t getSize() const + size_t getSize() const override { return mSize; } @@ -432,33 +440,33 @@ class DiscreteMirroredBuffer : public IMirroredBuffer class UnifiedMirroredBuffer : public IMirroredBuffer { public: - void allocate(size_t size) + void allocate(size_t size) override { mSize = size; mBuffer.allocate(size); } - void* getDeviceBuffer() const + void* getDeviceBuffer() const override { return mBuffer.get(); } - void* getHostBuffer() const + void* getHostBuffer() const override { return mBuffer.get(); } - void hostToDevice(TrtCudaStream& /*stream*/) + void hostToDevice(TrtCudaStream& stream) override { // Does nothing since we are using unified memory. } - void deviceToHost(TrtCudaStream& /*stream*/) + void deviceToHost(TrtCudaStream& stream) override { // Does nothing since we are using unified memory. } - size_t getSize() const + size_t getSize() const override { return mSize; } @@ -468,26 +476,70 @@ class UnifiedMirroredBuffer : public IMirroredBuffer TrtManagedBuffer mBuffer; }; // class UnifiedMirroredBuffer -inline void setCudaDevice(int device, std::ostream& os) +//! +//! Class to allocate memory for outputs with data-dependent shapes. The sizes of those are unknown so pre-allocation is +//! not possible. +//! +class OutputAllocator : public nvinfer1::IOutputAllocator { - cudaCheck(cudaSetDevice(device)); - - cudaDeviceProp properties; - cudaCheck(cudaGetDeviceProperties(&properties, device)); - -// clang-format off - os << "=== Device Information ===" << std::endl; - os << "Selected Device: " << properties.name << std::endl; - os << "Compute Capability: " << properties.major << "." << properties.minor << std::endl; - os << "SMs: " << properties.multiProcessorCount << std::endl; - os << "Compute Clock Rate: " << properties.clockRate / 1000000.0F << " GHz" << std::endl; - os << "Device Global Memory: " << (properties.totalGlobalMem >> 20) << " MiB" << std::endl; - os << "Shared Memory per SM: " << (properties.sharedMemPerMultiprocessor >> 10) << " KiB" << std::endl; - os << "Memory Bus Width: " << properties.memoryBusWidth << " bits" - << " (ECC " << (properties.ECCEnabled != 0 ? 
"enabled" : "disabled") << ")" << std::endl; - os << "Memory Clock Rate: " << properties.memoryClockRate / 1000000.0F << " GHz" << std::endl; - // clang-format on -} +public: + OutputAllocator(IMirroredBuffer* buffer) + : mBuffer(buffer) + { + } + + void* reallocateOutput( + char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment) noexcept override + { + // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr + // even for empty tensors, so allocate a dummy byte. + size = std::max(size, static_cast(1)); + if (size > mSize) + { + mBuffer->allocate(roundUp(size, alignment)); + mSize = size; + } + return mBuffer->getDeviceBuffer(); + } + + //! IMirroredBuffer does not implement Async allocation, hence this is just a wrap around + void* reallocateOutputAsync(char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment, + cudaStream_t /*stream*/) noexcept override + { + return reallocateOutput(tensorName, currentMemory, size, alignment); + } + + void notifyShape(char const* tensorName, nvinfer1::Dims const& dims) noexcept override + { + mFinalDims = dims; + } + + IMirroredBuffer* getBuffer() + { + return mBuffer.get(); + } + + nvinfer1::Dims getFinalDims() + { + return mFinalDims; + } + + ~OutputAllocator() override {} + +private: + std::unique_ptr mBuffer; + uint64_t mSize{}; + nvinfer1::Dims mFinalDims; +}; + +//! Set the GPU to run the inference on. +void setCudaDevice(int32_t device, std::ostream& os); + +//! Get the CUDA version of the current CUDA driver. +int32_t getCudaDriverVersion(); + +//! Get the CUDA version of the current CUDA runtime. +int32_t getCudaRuntimeVersion(); } // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleEngines.cpp b/src/Detector/tensorrt_yolo/common/sampleEngines.cpp index 8bb8a8fe..dacf6f2a 100644 --- a/src/Detector/tensorrt_yolo/common/sampleEngines.cpp +++ b/src/Detector/tensorrt_yolo/common/sampleEngines.cpp @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -15,6 +16,7 @@ */ #include +#include #include #include #include @@ -28,17 +30,16 @@ #include "NvInfer.h" #include "NvOnnxParser.h" -#include "common.h" #include "ErrorRecorder.h" +#include "common.h" #include "half.h" #include "logger.h" +#include "sampleDevice.h" #include "sampleEngines.h" #include "sampleOptions.h" #include "sampleUtils.h" -#if !defined(_WIN32) -#include -#endif +using namespace nvinfer1; namespace sample { @@ -46,7 +47,7 @@ namespace sample namespace { -std::map readScalesFromCalibrationCache(const std::string& calibrationFile) +std::map readScalesFromCalibrationCache(std::string const& calibrationFile) { std::map tensorScales; std::ifstream cache{calibrationFile}; @@ -63,7 +64,7 @@ std::map readScalesFromCalibrationCache(const std::string& c { // Scales should be stored in calibration cache as 32-bit floating numbers encoded as 32-bit integers int32_t scalesAsInt = std::stoi(line.substr(colonPos + 2, 8), nullptr, 16); - const auto tensorName = line.substr(0, colonPos); + auto const tensorName = line.substr(0, colonPos); tensorScales[tensorName] = *reinterpret_cast(&scalesAsInt); } } @@ -72,69 +73,185 @@ std::map readScalesFromCalibrationCache(const std::string& c } } // namespace -void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, const std::vector& inputFormats, - const std::vector& outputFormats, const std::string& calibrationFile) +nvinfer1::ICudaEngine* LazilyDeserializedEngine::get() +{ + SMP_RETVAL_IF_FALSE( + !mIsSafe, "Safe mode is enabled, but trying to get standard engine!", nullptr, sample::gLogError); + + if (mEngine == nullptr) + { + SMP_RETVAL_IF_FALSE(getFileReader().isOpen() || !getBlob().empty(), "Engine is empty. Nothing to deserialize!", + nullptr, sample::gLogError); + + using time_point = std::chrono::time_point; + using duration = std::chrono::duration; + time_point const deserializeStartTime{std::chrono::high_resolution_clock::now()}; + + if (mLeanDLLPath.empty()) + { + mRuntime.reset(createRuntime()); + } + else + { + mParentRuntime.reset(createRuntime()); + ASSERT(mParentRuntime.get() != nullptr); + + mRuntime.reset(mParentRuntime->loadRuntime(mLeanDLLPath.c_str())); + } + ASSERT(mRuntime.get() != nullptr); + + if (mVersionCompatible) + { + // Application needs to opt into allowing deserialization of engines with embedded lean runtime. 
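+            // Without this opt-in the runtime refuses to execute engine-embedded host code,
+            // so deserializing a version-compatible engine would fail here.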
+ mRuntime->setEngineHostCodeAllowed(true); + } + + if (!mTempdir.empty()) + { + mRuntime->setTemporaryDirectory(mTempdir.c_str()); + } + + mRuntime->setTempfileControlFlags(mTempfileControls); + + SMP_RETVAL_IF_FALSE(mRuntime != nullptr, "runtime creation failed", nullptr, sample::gLogError); + if (mDLACore != -1) + { + mRuntime->setDLACore(mDLACore); + } + mRuntime->setErrorRecorder(&gRecorder); +#if !TRT_WINML + for (auto const& pluginPath : mDynamicPlugins) + { + mRuntime->getPluginRegistry().loadLibrary(pluginPath.c_str()); + } +#endif + + if (getFileReader().isOpen()) + { + mEngine.reset(mRuntime->deserializeCudaEngine(getFileReader())); + } + else + { + auto const& engineBlob = getBlob(); + mEngine.reset(mRuntime->deserializeCudaEngine(engineBlob.data, engineBlob.size)); + } + SMP_RETVAL_IF_FALSE(mEngine != nullptr, "Engine deserialization failed", nullptr, sample::gLogError); + + time_point const deserializeEndTime{std::chrono::high_resolution_clock::now()}; + sample::gLogInfo << "Engine deserialized in " << duration(deserializeEndTime - deserializeStartTime).count() + << " sec." << std::endl; + } + + return mEngine.get(); +} + +nvinfer1::ICudaEngine* LazilyDeserializedEngine::release() { - const auto tensorScales = readScalesFromCalibrationCache(calibrationFile); - const bool broadcastInputFormats = broadcastIOFormats(inputFormats, network.getNbInputs()); + return mEngine.release(); +} + +void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, std::vector const& inputFormats, + std::vector const& outputFormats, std::string const& calibrationFile) +{ + auto const tensorScales = readScalesFromCalibrationCache(calibrationFile); + bool const broadcastInputFormats = broadcastIOFormats(inputFormats, network.getNbInputs()); for (int32_t i = 0, n = network.getNbInputs(); i < n; ++i) { int32_t formatIdx = broadcastInputFormats ? 0 : i; - if (!inputFormats.empty() && inputFormats[formatIdx].first == nvinfer1::DataType::kINT8) + if (!inputFormats.empty() && inputFormats[formatIdx].first == DataType::kINT8) { auto* input = network.getInput(i); - const auto calibScale = tensorScales.at(input->getName()); + auto const calibScale = tensorScales.at(input->getName()); input->setDynamicRange(-127 * calibScale, 127 * calibScale); } } - const bool broadcastOutputFormats = broadcastIOFormats(outputFormats, network.getNbInputs()); + bool const broadcastOutputFormats = broadcastIOFormats(outputFormats, network.getNbOutputs()); for (int32_t i = 0, n = network.getNbOutputs(); i < n; ++i) { int32_t formatIdx = broadcastOutputFormats ? 0 : i; - if (!outputFormats.empty() && outputFormats[formatIdx].first == nvinfer1::DataType::kINT8) + if (!outputFormats.empty() && outputFormats[formatIdx].first == DataType::kINT8) { auto* output = network.getOutput(i); - const auto calibScale = tensorScales.at(output->getName()); + auto const calibScale = tensorScales.at(output->getName()); output->setDynamicRange(-127 * calibScale, 127 * calibScale); } } } -#define SMP_RETVAL_IF_FALSE(condition, msg, retval, err) \ - { \ - if ((condition) == false) \ - { \ - (err) << (msg) << std::endl; \ - return retval; \ - } \ - } - -Parser modelToNetwork(const ModelOptions& model, nvinfer1::INetworkDefinition& network, std::ostream& err) +//! +//! \brief Generate a network definition for a given model +//! +//! \param[in] model Model options for this network +//! \param[in,out] network Network storing the parsed results +//! \param[in,out] err Error stream +//! 
\param[out] vcPluginLibrariesUsed If not nullptr, will be populated with paths to VC plugin libraries required by +//! the parsed network. +//! +//! \return Parser The parser used to initialize the network and that holds the weights for the network, or an invalid +//! parser (the returned parser converts to false if tested) +//! +//! Constant input dimensions in the model must not be changed in the corresponding +//! network definition, because its correctness may rely on the constants. +//! +//! \see Parser::operator bool() +//! +Parser modelToNetwork(ModelOptions const& model, BuildOptions const& build, nvinfer1::INetworkDefinition& network, + std::ostream& err, std::vector* vcPluginLibrariesUsed) { - sample::gLogInfo << "Start parsing network model" << std::endl; + sample::gLogInfo << "Start parsing network model." << std::endl; + auto const tBegin = std::chrono::high_resolution_clock::now(); + Parser parser; - //const std::string& modelName = model.baseModel.model; switch (model.baseModel.format) { case ModelFormat::kONNX: { using namespace nvonnxparser; - parser.onnxParser.reset(createParser(network, sample::gLogger.getTRTLogger())); + parser.onnxParser.reset(createONNXParser(network)); + ASSERT(parser.onnxParser != nullptr); +#if !TRT_WINML + // kNATIVE_INSTANCENORM is ON by default in the parser and must be cleared to use the plugin implementation. + if (build.pluginInstanceNorm) + { + parser.onnxParser->clearFlag(OnnxParserFlag::kNATIVE_INSTANCENORM); + } +#endif if (!parser.onnxParser->parseFromFile( model.baseModel.model.c_str(), static_cast(sample::gLogger.getReportableSeverity()))) { err << "Failed to parse onnx file" << std::endl; parser.onnxParser.reset(); } +#if !TRT_WINML + if (vcPluginLibrariesUsed && parser.onnxParser.get()) + { + int64_t nbPluginLibs; + char const* const* pluginLibArray = parser.onnxParser->getUsedVCPluginLibraries(nbPluginLibs); + if (nbPluginLibs >= 0) + { + vcPluginLibrariesUsed->reserve(nbPluginLibs); + for (int64_t i = 0; i < nbPluginLibs; ++i) + { + sample::gLogInfo << "Using VC plugin library " << pluginLibArray[i] << std::endl; + vcPluginLibrariesUsed->emplace_back(std::string{pluginLibArray[i]}); + } + } + else + { + sample::gLogWarning << "Failure to query VC plugin libraries required by parsed ONNX network" + << std::endl; + } + } +#endif break; } - case ModelFormat::kANY: - break; - default: - break; + case ModelFormat::kANY: break; } - sample::gLogInfo << "Finish parsing network model" << std::endl; + auto const tEnd = std::chrono::high_resolution_clock::now(); + float const parseTime = std::chrono::duration(tEnd - tBegin).count(); + + sample::gLogInfo << "Finished parsing network model. 
Parse time: " << parseTime << std::endl; return parser; } @@ -144,10 +261,10 @@ namespace class RndInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 { public: - RndInt8Calibrator(int batches, std::vector& elemCount, const std::string& cacheFile, - const nvinfer1::INetworkDefinition& network, std::ostream& err); + RndInt8Calibrator(int32_t batches, std::vector& elemCount, std::string const& cacheFile, + nvinfer1::INetworkDefinition const& network, std::ostream& err); - ~RndInt8Calibrator() + ~RndInt8Calibrator() override { for (auto& elem : mInputDeviceBuffers) { @@ -155,28 +272,28 @@ class RndInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 } } - bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept override; + bool getBatch(void* bindings[], char const* names[], int32_t nbBindings) noexcept override; - int getBatchSize() const noexcept override + int32_t getBatchSize() const noexcept override { return 1; } const void* readCalibrationCache(size_t& length) noexcept override; - virtual void writeCalibrationCache(const void*, size_t) noexcept override {} + void writeCalibrationCache(void const*, size_t) noexcept override {} private: - int mBatches{}; - int mCurrentBatch{}; + int32_t mBatches{}; + int32_t mCurrentBatch{}; std::string mCacheFile; std::map mInputDeviceBuffers; std::vector mCalibrationCache; std::ostream& mErr; }; -RndInt8Calibrator::RndInt8Calibrator(int batches, std::vector& elemCount, const std::string& cacheFile, - const nvinfer1::INetworkDefinition& network, std::ostream& err) +RndInt8Calibrator::RndInt8Calibrator(int32_t batches, std::vector& elemCount, std::string const& cacheFile, + INetworkDefinition const& network, std::ostream& err) : mBatches(batches) , mCurrentBatch(0) , mCacheFile(cacheFile) @@ -192,7 +309,7 @@ RndInt8Calibrator::RndInt8Calibrator(int batches, std::vector& elemCoun std::uniform_real_distribution distribution(-1.0F, 1.0F); auto gen = [&generator, &distribution]() { return distribution(generator); }; - for (int i = 0; i < network.getNbInputs(); i++) + for (int32_t i = 0; i < network.getNbInputs(); i++) { auto* input = network.getInput(i); std::vector rnd_data(elemCount[i]); @@ -206,14 +323,14 @@ RndInt8Calibrator::RndInt8Calibrator(int batches, std::vector& elemCoun } } -bool RndInt8Calibrator::getBatch(void* bindings[], const char* names[], int nbBindings) noexcept +bool RndInt8Calibrator::getBatch(void* bindings[], char const* names[], int32_t nbBindings) noexcept { if (mCurrentBatch >= mBatches) { return false; } - for (int i = 0; i < nbBindings; ++i) + for (int32_t i = 0; i < nbBindings; ++i) { bindings[i] = mInputDeviceBuffers[names[i]]; } @@ -238,35 +355,35 @@ const void* RndInt8Calibrator::readCalibrationCache(size_t& length) noexcept return !mCalibrationCache.empty() ? mCalibrationCache.data() : nullptr; } -bool setTensorDynamicRange(const nvinfer1::INetworkDefinition& network, float inRange = 2.0F, float outRange = 4.0F) +bool setTensorDynamicRange(INetworkDefinition const& network, float inRange = 2.0F, float outRange = 4.0F) { // Ensure that all layer inputs have a dynamic range. - for (int l = 0; l < network.getNbLayers(); l++) + for (int32_t l = 0; l < network.getNbLayers(); l++) { auto* layer = network.getLayer(l); - for (int i = 0; i < layer->getNbInputs(); i++) + for (int32_t i = 0; i < layer->getNbInputs(); i++) { - nvinfer1::ITensor* input{layer->getInput(i)}; + ITensor* input{layer->getInput(i)}; // Optional inputs are nullptr here and are from RNN layers. 
if (input && !input->dynamicRangeIsSet()) { // Concat should propagate dynamic range from outputs to inputs to avoid // Re-quantization during the concatenation - auto dynRange = (layer->getType() == nvinfer1::LayerType::kCONCATENATION) ? outRange : inRange; + auto dynRange = (layer->getType() == LayerType::kCONCATENATION) ? outRange : inRange; if (!input->setDynamicRange(-dynRange, dynRange)) { return false; } } } - for (int o = 0; o < layer->getNbOutputs(); o++) + for (int32_t o = 0; o < layer->getNbOutputs(); o++) { - nvinfer1::ITensor* output{layer->getOutput(o)}; + ITensor* output{layer->getOutput(o)}; // Optional outputs are nullptr here and are from RNN layers. if (output && !output->dynamicRangeIsSet()) { // Pooling must have the same input and output dynamic range. - if (layer->getType() == nvinfer1::LayerType::kPOOLING) + if (layer->getType() == LayerType::kPOOLING) { if (!output->setDynamicRange(-inRange, inRange)) { @@ -286,319 +403,43 @@ bool setTensorDynamicRange(const nvinfer1::INetworkDefinition& network, float in return true; } -// Walk the weights elements and overwrite (at most) 2 out of 4 elements to 0. -template -void sparsify(const T* values, int64_t count, int32_t k, int32_t rs, std::vector& sparseWeights) -{ - const auto c = count / (k * rs); - sparseWeights.resize(count * sizeof(T)); - auto* sparseValues = reinterpret_cast(sparseWeights.data()); - - constexpr int32_t window = 4; - constexpr int32_t nonzeros = 2; - - const int32_t crs = c * rs; - const auto getIndex = [=](int32_t ki, int32_t ci, int32_t rsi) { return ki * crs + ci * rs + rsi; }; - - for (int64_t ki = 0; ki < k; ++ki) - { - for (int64_t rsi = 0; rsi < rs; ++rsi) - { - int32_t w = 0; - int32_t nz = 0; - for (int64_t ci = 0; ci < c; ++ci) - { - const auto index = getIndex(ki, ci, rsi); - if (nz < nonzeros) - { - sparseValues[index] = values[index]; - ++nz; - } - else - { - sparseValues[index] = 0; - } - if (++w == window) - { - w = 0; - nz = 0; - } - } - } - } -} - -void sparsify(const nvinfer1::Weights& weights, int32_t k, int32_t rs, std::vector& sparseWeights) -{ - switch (weights.type) - { - case nvinfer1::DataType::kFLOAT: - sparsify(static_cast(weights.values), weights.count, k, rs, sparseWeights); - break; - case nvinfer1::DataType::kHALF: - sparsify(static_cast(weights.values), weights.count, k, rs, sparseWeights); - break; - case nvinfer1::DataType::kINT8: - case nvinfer1::DataType::kINT32: - case nvinfer1::DataType::kBOOL: break; - } -} - -template -void setSparseWeights(L& l, int32_t k, int32_t rs, std::vector& sparseWeights) +bool isNonActivationType(nvinfer1::DataType const type) { - auto weights = l.getKernelWeights(); - sparsify(weights, k, rs, sparseWeights); - weights.values = sparseWeights.data(); - l.setKernelWeights(weights); + return type == nvinfer1::DataType::kINT32 || type == nvinfer1::DataType::kINT64 || type == nvinfer1::DataType::kBOOL + || type == nvinfer1::DataType::kUINT8; } -template -void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n) +void setLayerPrecisions(INetworkDefinition& network, LayerPrecisions const& layerPrecisions) { - ASSERT(dst != src); - T* tdst = reinterpret_cast(dst); - T const* tsrc = reinterpret_cast(src); - for (int32_t mi = 0; mi < m; ++mi) - { - for (int32_t ni = 0; ni < n; ++ni) - { - int32_t const isrc = mi * n + ni; - int32_t const idst = ni * m + mi; - tdst[idst] = tsrc[isrc]; - } - } -} - -// Sparsify the weights of Constant layers that are fed to MatMul via Shuffle layers. 
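// (These helpers enforce the 2:4 structured-sparsity pattern -- at most two non-zero
// values in every window of four along the channel axis -- which is the weight layout
// the kSPARSE_WEIGHTS tactics operate on.)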
-// Forward analysis on the API graph to determine which weights to sparsify. -void sparsifyMatMulKernelWeights(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights) -{ - using TensorToLayer = std::unordered_map; - using LayerToTensor = std::unordered_map; - - // 1. Collect layers and tensors information from the network. - TensorToLayer matmulI2L; - TensorToLayer constO2L; - TensorToLayer shuffleI2L; - LayerToTensor shuffleL2O; - auto collectMappingInfo = [&](int32_t const idx) { - nvinfer1::ILayer* l = network.getLayer(idx); - switch (l->getType()) - { - case nvinfer1::LayerType::kMATRIX_MULTIPLY: - { - // assume weights on the second input. - matmulI2L.insert({l->getInput(1), l}); - break; - } - case nvinfer1::LayerType::kCONSTANT: - { - nvinfer1::DataType const dtype = static_cast(l)->getWeights().type; - if (dtype == nvinfer1::DataType::kFLOAT || dtype == nvinfer1::DataType::kHALF) - { - // Sparsify float only. - constO2L.insert({l->getOutput(0), l}); - } - break; - } - case nvinfer1::LayerType::kSHUFFLE: - { - shuffleI2L.insert({l->getInput(0), l}); - shuffleL2O.insert({l, l->getOutput(0)}); - break; - } - default: break; - } - }; - int32_t const nbLayers = network.getNbLayers(); - for (int32_t i = 0; i < nbLayers; ++i) - { - collectMappingInfo(i); - } - if (matmulI2L.size() == 0 || constO2L.size() == 0) - { - // No MatrixMultiply or Constant layer found, no weights to sparsify. - return; - } - - // Helper for analysis - auto isTranspose = [](nvinfer1::Permutation const& perm) -> bool { return (perm.order[0] == 1 && perm.order[1] == 0); }; - auto is2D = [](nvinfer1::Dims const& dims) -> bool { return dims.nbDims == 2; }; - auto isIdenticalReshape = [](nvinfer1::Dims const& dims) -> bool { - for (int32_t i = 0; i < dims.nbDims; ++i) - { - if (dims.d[i] != i || dims.d[i] != -1) - { - return false; - } - } - return true; - }; - auto tensorReachedViaTranspose = [&](nvinfer1::ITensor* t, bool& needTranspose) - { - while (shuffleI2L.find(t) != shuffleI2L.end()) - { - nvinfer1::IShuffleLayer* s = static_cast(shuffleI2L.at(t)); - if (!is2D(s->getInput(0)->getDimensions()) || !is2D(s->getReshapeDimensions()) - || !isIdenticalReshape(s->getReshapeDimensions())) - { - break; - } - - if (isTranspose(s->getFirstTranspose())) - needTranspose = !needTranspose; - if (isTranspose(s->getSecondTranspose())) - needTranspose = !needTranspose; - - t = shuffleL2O.at(s); - } - return t; - }; - - // 2. Forward analysis to collect the Constant layers connected to MatMul via Transpose - std::unordered_map constantLayerToSparse; - for (auto& o2l : constO2L) - { - // If need to transpose the weights of the Constant layer. - // Need to transpose by default due to semantic difference. - bool needTranspose{true}; - nvinfer1::ITensor* t = tensorReachedViaTranspose(o2l.first, needTranspose); - if (matmulI2L.find(t) == matmulI2L.end()) - { - continue; - } - - // check MatMul params... - nvinfer1::IMatrixMultiplyLayer* mm = static_cast(matmulI2L.at(t)); - bool const twoInputs = mm->getNbInputs() == 2; - bool const all2D = is2D(mm->getInput(0)->getDimensions()) && is2D(mm->getInput(1)->getDimensions()); - bool const isSimple - = mm->getOperation(0) == nvinfer1::MatrixOperation::kNONE && mm->getOperation(1) != nvinfer1::MatrixOperation::kVECTOR; - if (!(twoInputs && all2D && isSimple)) - continue; - - if (mm->getOperation(1) == nvinfer1::MatrixOperation::kTRANSPOSE) - needTranspose = !needTranspose; - - constantLayerToSparse.insert({static_cast(o2l.second), needTranspose}); - } - - // 3. 
Finally, sparsify the weights - auto sparsifyConstantWeights = [&sparseWeights](nvinfer1::IConstantLayer* layer, bool const needTranspose) - { - nvinfer1::Dims dims = layer->getOutput(0)->getDimensions(); - ASSERT(dims.nbDims == 2); - int32_t const idxN = needTranspose ? 1 : 0; - int32_t const n = dims.d[idxN]; - int32_t const k = dims.d[1 - idxN]; - sparseWeights.emplace_back(); - std::vector& spw = sparseWeights.back(); - nvinfer1::Weights w = layer->getWeights(); - nvinfer1::DataType const dtype = w.type; - ASSERT(dtype == nvinfer1::DataType::kFLOAT || dtype == nvinfer1::DataType::kHALF); // non-float weights should have been ignored. - - if (needTranspose) - { - if (dtype == nvinfer1::DataType::kFLOAT) - { - spw.resize(w.count * sizeof(float)); - transpose2DWeights(spw.data(), w.values, k, n); - } - else if (dtype == nvinfer1::DataType::kHALF) - { - spw.resize(w.count * sizeof(half_float::half)); - transpose2DWeights(spw.data(), w.values, k, n); - } - - w.values = spw.data(); - std::vector tmpW; - sparsify(w, n, 1, tmpW); - - if (dtype == nvinfer1::DataType::kFLOAT) - transpose2DWeights(spw.data(), tmpW.data(), n, k); - else if (dtype == nvinfer1::DataType::kHALF) - transpose2DWeights(spw.data(), tmpW.data(), n, k); - } - else - { - sparsify(w, n, 1, spw); - } - - w.values = spw.data(); - layer->setWeights(w); - }; - for (auto& l : constantLayerToSparse) - { - sparsifyConstantWeights(l.first, l.second); - } -} - -void sparsify(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights) -{ - for (int32_t l = 0; l < network.getNbLayers(); ++l) - { - auto* layer = network.getLayer(l); - const auto t = layer->getType(); - if (t == nvinfer1::LayerType::kCONVOLUTION) - { - auto& conv = *static_cast(layer); - const auto& dims = conv.getKernelSizeNd(); - if (dims.nbDims > 2) - { - continue; - } - const auto k = conv.getNbOutputMaps(); - const auto rs = dims.d[0] * dims.d[1]; - sparseWeights.emplace_back(); - setSparseWeights(conv, k, rs, sparseWeights.back()); - } - else if (t == nvinfer1::LayerType::kFULLY_CONNECTED) - { - auto& fc = *static_cast(layer); - const auto k = fc.getNbOutputChannels(); - sparseWeights.emplace_back(); - setSparseWeights(fc, k, 1, sparseWeights.back()); - } - } - - sparsifyMatMulKernelWeights(network, sparseWeights); -} - -void setLayerPrecisions(nvinfer1::INetworkDefinition& network, LayerPrecisions const& layerPrecisions) -{ - bool const hasGlobalPrecision{layerPrecisions.find("*") != layerPrecisions.end()}; - auto const globalPrecision = hasGlobalPrecision ? layerPrecisions.at("*") : nvinfer1::DataType::kFLOAT; bool hasLayerPrecisionSkipped{false}; for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) { auto* layer = network.getLayer(layerIdx); auto const layerName = layer->getName(); - if (layerPrecisions.find(layer->getName()) != layerPrecisions.end()) + auto exactMatch = layerPrecisions.find(layerName); + auto plausibleMatch = findPlausible(layerPrecisions, layerName); + if (exactMatch != layerPrecisions.end()) { - layer->setPrecision(layerPrecisions.at(layer->getName())); + sample::gLogInfo << "Set layer " << layerName << " to precision " << exactMatch->second << std::endl; + layer->setPrecision(exactMatch->second); } - else if (hasGlobalPrecision) + else if (plausibleMatch != layerPrecisions.end()) { - // We should not set the layer precision if its default precision is INT32 or Bool. 
- if (layer->getPrecision() == nvinfer1::DataType::kINT32 - || layer->getPrecision() == nvinfer1::DataType::kBOOL) + if (isNonActivationType(layer->getPrecision())) { hasLayerPrecisionSkipped = true; sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because the " - << " default layer precision is INT32 or Bool." << std::endl; + << " default layer precision is of non-activation type." << std::endl; continue; } - // We should not set the constant layer precision if its weights are in INT32. if (layer->getType() == nvinfer1::LayerType::kCONSTANT - && static_cast(layer)->getWeights().type == nvinfer1::DataType::kINT32) + && (isNonActivationType(static_cast(layer)->getWeights().type))) { hasLayerPrecisionSkipped = true; sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because this " - << "constant layer has INT32 weights." << std::endl; + << "constant layer has weights of non-activation type." << std::endl; continue; } - // We should not set the layer precision if the layer operates on a shape tensor. if (layer->getNbInputs() >= 1 && layer->getInput(0)->isShapeTensor()) { hasLayerPrecisionSkipped = true; @@ -606,18 +447,17 @@ void setLayerPrecisions(nvinfer1::INetworkDefinition& network, LayerPrecisions c << "operates on a shape tensor." << std::endl; continue; } - if ((layer->getType() == nvinfer1::LayerType::kIDENTITY - || layer->getType() == nvinfer1::LayerType::kSHUFFLE) - && layer->getNbInputs() >= 1 && layer->getInput(0)->getType() == nvinfer1::DataType::kINT32 - && layer->getNbOutputs() >= 1 && layer->getOutput(0)->getType() == nvinfer1::DataType::kINT32) + if (layer->getNbInputs() >= 1 && isNonActivationType(layer->getInput(0)->getType()) + && layer->getNbOutputs() >= 1 && isNonActivationType(layer->getOutput(0)->getType())) { hasLayerPrecisionSkipped = true; sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because this " - << "layer has INT32 input and output." << std::endl; + << "layer has input and output of non-activation type." << std::endl; continue; } // All heuristics passed. Set the layer precision. - layer->setPrecision(globalPrecision); + sample::gLogInfo << "Set layer " << layerName << " to precision " << plausibleMatch->second << std::endl; + layer->setPrecision(plausibleMatch->second); } } @@ -628,7 +468,7 @@ void setLayerPrecisions(nvinfer1::INetworkDefinition& network, LayerPrecisions c } } -void setLayerOutputTypes(nvinfer1::INetworkDefinition& network, LayerOutputTypes const& layerOutputTypes) +void setLayerOutputTypes(INetworkDefinition& network, LayerOutputTypes const& layerOutputTypes) { bool const hasGlobalOutputType{layerOutputTypes.find("*") != layerOutputTypes.end()}; auto const globalOutputType = hasGlobalOutputType ? 
layerOutputTypes.at("*").at(0) : nvinfer1::DataType::kFLOAT; @@ -638,9 +478,11 @@ void setLayerOutputTypes(nvinfer1::INetworkDefinition& network, LayerOutputTypes auto* layer = network.getLayer(layerIdx); auto const layerName = layer->getName(); auto const nbOutputs = layer->getNbOutputs(); - if (layerOutputTypes.find(layer->getName()) != layerOutputTypes.end()) + auto exactMatch = layerOutputTypes.find(layerName); + auto plausibleMatch = findPlausible(layerOutputTypes, layerName); + if (exactMatch != layerOutputTypes.end()) { - auto const& outputTypes = layerOutputTypes.at(layer->getName()); + auto const& outputTypes = exactMatch->second; bool const isBroadcast = (outputTypes.size() == 1); if (!isBroadcast && static_cast(outputTypes.size()) != nbOutputs) { @@ -651,11 +493,17 @@ void setLayerOutputTypes(nvinfer1::INetworkDefinition& network, LayerOutputTypes } for (int32_t outputIdx = 0; outputIdx < nbOutputs; ++outputIdx) { - layer->setOutputType(outputIdx, outputTypes.at(isBroadcast ? 0 : outputIdx)); + auto const outputType = outputTypes.at(isBroadcast ? 0 : outputIdx); + sample::gLogInfo << "Set output " << outputIdx << " of layer " << layerName << " to type " << outputType + << std::endl; + layer->setOutputType(outputIdx, outputType); } } - else if (hasGlobalOutputType) + else if (plausibleMatch != layerOutputTypes.end()) { + auto const& outputTypes = plausibleMatch->second; + bool const isBroadcast = (outputTypes.size() == 1); + // We should not set the layer output types if its default precision is INT32 or Bool. if (layer->getPrecision() == nvinfer1::DataType::kINT32 || layer->getPrecision() == nvinfer1::DataType::kBOOL) @@ -667,7 +515,7 @@ void setLayerOutputTypes(nvinfer1::INetworkDefinition& network, LayerOutputTypes } // We should not set the constant layer output types if its weights are in INT32. if (layer->getType() == nvinfer1::LayerType::kCONSTANT - && static_cast(layer)->getWeights().type == nvinfer1::DataType::kINT32) + && static_cast(layer)->getWeights().type == nvinfer1::DataType::kINT32) { hasLayerOutputTypeSkipped = true; sample::gLogVerbose << "Skipped setting output types for layer " << layerName << " because this " @@ -684,6 +532,10 @@ void setLayerOutputTypes(nvinfer1::INetworkDefinition& network, LayerOutputTypes << layerName << " because it is a shape tensor." << std::endl; continue; } + + auto const outputType = outputTypes.at(isBroadcast ? 
0 : outputIdx); + sample::gLogInfo << "Set output " << outputIdx << " of layer " << layerName << " to type " << outputType + << std::endl; layer->setOutputType(outputIdx, globalOutputType); } } @@ -696,45 +548,129 @@ void setLayerOutputTypes(nvinfer1::INetworkDefinition& network, LayerOutputTypes } } -void setMemoryPoolLimits(nvinfer1::IBuilderConfig& config, BuildOptions const& build) +void setLayerDeviceTypes( + INetworkDefinition const& network, IBuilderConfig& config, LayerDeviceTypes const& layerDeviceTypes) { - auto const roundToBytes = [](double const sizeInMB) { return static_cast(sizeInMB * (1 << 20)); }; + for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) + { + auto* layer = network.getLayer(layerIdx); + auto const layerName = layer->getName(); + auto match = findPlausible(layerDeviceTypes, layerName); + if (match != layerDeviceTypes.end()) + { + DeviceType const deviceType = match->second; + sample::gLogInfo << "Set layer " << layerName << " to device type " << (int)deviceType << std::endl; + config.setDeviceType(layer, deviceType); + } + } +} + +void markDebugTensors(INetworkDefinition& network, StringSet const& debugTensors) +{ + for (int64_t inputIndex = 0; inputIndex < network.getNbInputs(); ++inputIndex) + { + auto* t = network.getInput(inputIndex); + auto const tensorName = t->getName(); + if (debugTensors.count(tensorName) > 0) + { + network.markDebug(*t); + } + } + for (int64_t layerIndex = 0; layerIndex < network.getNbLayers(); ++layerIndex) + { + auto* layer = network.getLayer(layerIndex); + for (int64_t outputIndex = 0; outputIndex < layer->getNbOutputs(); ++outputIndex) + { + auto* t = layer->getOutput(outputIndex); + auto const tensorName = t->getName(); + if (debugTensors.count(tensorName) > 0) + { + network.markDebug(*t); + } + } + } +} + +void setMemoryPoolLimits(IBuilderConfig& config, BuildOptions const& build) +{ + auto const roundToBytes = [](double const size, bool fromMB = true) { + return static_cast(size * (fromMB ? 1.0_MiB : 1.0_KiB)); + }; if (build.workspace >= 0) - config.setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, roundToBytes(build.workspace)); + { + config.setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, roundToBytes(build.workspace)); + } if (build.dlaSRAM >= 0) - config.setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_MANAGED_SRAM, roundToBytes(build.dlaSRAM)); + { + size_t const sizeInBytes = roundToBytes(build.dlaSRAM); + size_t sizeInPowerOf2{1}; + // Using 2^30 bytes as a loose upper bound to prevent the possibility of overflows and infinite loops. + while (sizeInPowerOf2 < 31 && (static_cast(1) << sizeInPowerOf2) <= sizeInBytes) + { + ++sizeInPowerOf2; + } + --sizeInPowerOf2; + if (sizeInPowerOf2 == 30) + { + sample::gLogWarning + << "User-specified DLA managed SRAM size is too large and has been clipped to 2^30 bytes. " + << "Please make sure that this is the intended managed SRAM size." 
<< std::endl; + } + config.setMemoryPoolLimit(MemoryPoolType::kDLA_MANAGED_SRAM, static_cast(1) << sizeInPowerOf2); + } if (build.dlaLocalDRAM >= 0) - config.setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_LOCAL_DRAM, roundToBytes(build.dlaLocalDRAM)); + { + config.setMemoryPoolLimit(MemoryPoolType::kDLA_LOCAL_DRAM, roundToBytes(build.dlaLocalDRAM)); + } if (build.dlaGlobalDRAM >= 0) - config.setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_GLOBAL_DRAM, roundToBytes(build.dlaGlobalDRAM)); + { + config.setMemoryPoolLimit(MemoryPoolType::kDLA_GLOBAL_DRAM, roundToBytes(build.dlaGlobalDRAM)); + } + if (build.tacticSharedMem >= 0) + { + config.setMemoryPoolLimit(MemoryPoolType::kTACTIC_SHARED_MEMORY, roundToBytes(build.tacticSharedMem, false)); + } +} + +void setPreviewFeatures(IBuilderConfig& config, BuildOptions const& build) +{ + auto const setFlag = [&](PreviewFeature feat) { + int32_t featVal = static_cast(feat); + if (build.previewFeatures.find(featVal) != build.previewFeatures.end()) + { + config.setPreviewFeature(feat, build.previewFeatures.at(featVal)); + } + }; + setFlag(PreviewFeature::kALIASED_PLUGIN_IO_10_03); } } // namespace -bool setupNetworkAndConfig(const BuildOptions& build, const SystemOptions& sys, nvinfer1::IBuilder& builder, - nvinfer1::INetworkDefinition& network, nvinfer1::IBuilderConfig& config, std::ostream& err, - std::vector>& sparseWeights) +bool setupNetworkAndConfig(BuildOptions const& build, SystemOptions const& sys, IBuilder& builder, + INetworkDefinition& network, IBuilderConfig& config, std::unique_ptr& calibrator, + std::ostream& err, std::vector>& sparseWeights) { - nvinfer1::IOptimizationProfile* profile{nullptr}; - if (build.maxBatch) - builder.setMaxBatchSize(build.maxBatch); - else + std::vector profiles{}; + profiles.resize(build.optProfiles.size()); + for (auto& profile : profiles) + { profile = builder.createOptimizationProfile(); + } bool hasDynamicShapes{false}; bool broadcastInputFormats = broadcastIOFormats(build.inputFormats, network.getNbInputs()); - if (profile) + // Check if the provided input tensor names match the input tensors of the engine. + // Throw an error if the provided input tensor names cannot be found because it implies a potential typo. + for (auto const& shapes : build.optProfiles) { - // Check if the provided input tensor names match the input tensors of the engine. - // Throw an error if the provided input tensor names cannot be found because it implies a potential typo. - for (const auto& shape : build.shapes) + for (auto const& shape : shapes) { bool tensorNameFound{false}; for (int32_t i = 0; i < network.getNbInputs(); ++i) { - if (network.getInput(i)->getName() == shape.first) + if (matchStringWithOneWildcard(shape.first, network.getInput(i)->getName())) { tensorNameFound = true; break; @@ -755,45 +691,31 @@ bool setupNetworkAndConfig(const BuildOptions& build, const SystemOptions& sys, auto* input = network.getInput(i); if (!build.inputFormats.empty()) { - int inputFormatIndex = broadcastInputFormats ? 0 : i; + int32_t inputFormatIndex = broadcastInputFormats ? 0 : i; input->setType(build.inputFormats[inputFormatIndex].first); input->setAllowedFormats(build.inputFormats[inputFormatIndex].second); } - else - { - switch (input->getType()) - { - case nvinfer1::DataType::kINT32: - case nvinfer1::DataType::kBOOL: - case nvinfer1::DataType::kHALF: - // Leave these as is. - break; - case nvinfer1::DataType::kFLOAT: - case nvinfer1::DataType::kINT8: - // User did not specify a floating-point format. Default to kFLOAT. 
- input->setType(nvinfer1::DataType::kFLOAT); - break; - } - input->setAllowedFormats(1U << static_cast(nvinfer1::TensorFormat::kLINEAR)); - } - if (profile) + auto const dims = input->getDimensions(); + auto const isScalar = dims.nbDims == 0; + auto const isDynamicInput = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }) + || input->isShapeTensor(); + if (isDynamicInput) { - auto const dims = input->getDimensions(); - auto const isScalar = dims.nbDims == 0; - auto const isDynamicInput = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }) - || input->isShapeTensor(); - if (isDynamicInput) + hasDynamicShapes = true; + for (size_t i = 0; i < build.optProfiles.size(); i++) { - hasDynamicShapes = true; - auto shape = build.shapes.find(input->getName()); + auto const& optShapes = build.optProfiles[i]; + auto profile = profiles[i]; + auto const tensorName = input->getName(); + auto shape = findPlausible(optShapes, tensorName); ShapeRange shapes{}; // If no shape is provided, set dynamic dimensions to 1. - if (shape == build.shapes.end()) + if (shape == optShapes.end()) { - constexpr int DEFAULT_DIMENSION = 1; - std::vector staticDims; + constexpr int32_t kDEFAULT_DIMENSION{1}; + std::vector staticDims; if (input->isShapeTensor()) { if (isScalar) @@ -803,16 +725,16 @@ bool setupNetworkAndConfig(const BuildOptions& build, const SystemOptions& sys, else { staticDims.resize(dims.d[0]); - std::fill(staticDims.begin(), staticDims.end(), DEFAULT_DIMENSION); + std::fill(staticDims.begin(), staticDims.end(), kDEFAULT_DIMENSION); } } else { staticDims.resize(dims.nbDims); std::transform(dims.d, dims.d + dims.nbDims, staticDims.begin(), - [&](int dimension) { return dimension > 0 ? dimension : DEFAULT_DIMENSION; }); + [&](int dimension) { return dimension > 0 ? dimension : kDEFAULT_DIMENSION; }); } - sample::gLogWarning << "Dynamic dimensions required for input: " << input->getName() + sample::gLogWarning << "Dynamic dimensions required for input: " << tensorName << ", but no shapes were provided. 
Automatically overriding shape to: " << staticDims << std::endl; std::fill(shapes.begin(), shapes.end(), staticDims); @@ -825,39 +747,62 @@ bool setupNetworkAndConfig(const BuildOptions& build, const SystemOptions& sys, std::vector profileDims{}; if (input->isShapeTensor()) { - profileDims = shapes[static_cast(nvinfer1::OptProfileSelector::kMIN)]; - SMP_RETVAL_IF_FALSE(profile->setShapeValues(input->getName(), nvinfer1::OptProfileSelector::kMIN, + profileDims = shapes[static_cast(OptProfileSelector::kMIN)]; + SMP_RETVAL_IF_FALSE(profile->setShapeValues(tensorName, OptProfileSelector::kMIN, profileDims.data(), static_cast(profileDims.size())), "Error in set shape values MIN", false, err); - profileDims = shapes[static_cast(nvinfer1::OptProfileSelector::kOPT)]; - SMP_RETVAL_IF_FALSE(profile->setShapeValues(input->getName(), nvinfer1::OptProfileSelector::kOPT, + profileDims = shapes[static_cast(OptProfileSelector::kOPT)]; + SMP_RETVAL_IF_FALSE(profile->setShapeValues(tensorName, OptProfileSelector::kOPT, profileDims.data(), static_cast(profileDims.size())), "Error in set shape values OPT", false, err); - profileDims = shapes[static_cast(nvinfer1::OptProfileSelector::kMAX)]; - SMP_RETVAL_IF_FALSE(profile->setShapeValues(input->getName(), nvinfer1::OptProfileSelector::kMAX, + profileDims = shapes[static_cast(OptProfileSelector::kMAX)]; + SMP_RETVAL_IF_FALSE(profile->setShapeValues(tensorName, OptProfileSelector::kMAX, profileDims.data(), static_cast(profileDims.size())), "Error in set shape values MAX", false, err); + sample::gLogInfo << "Set input shape tensor " << tensorName << " for optimization profile " << i + << " to:" + << " MIN=" << shapes[static_cast(OptProfileSelector::kMIN)] + << " OPT=" << shapes[static_cast(OptProfileSelector::kOPT)] + << " MAX=" << shapes[static_cast(OptProfileSelector::kMAX)] << std::endl; } else { - profileDims = shapes[static_cast(nvinfer1::OptProfileSelector::kMIN)]; + profileDims = shapes[static_cast(OptProfileSelector::kMIN)]; SMP_RETVAL_IF_FALSE( - profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, toDims(profileDims)), + profile->setDimensions(tensorName, OptProfileSelector::kMIN, toDims(profileDims)), "Error in set dimensions to profile MIN", false, err); - profileDims = shapes[static_cast(nvinfer1::OptProfileSelector::kOPT)]; + profileDims = shapes[static_cast(OptProfileSelector::kOPT)]; SMP_RETVAL_IF_FALSE( - profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, toDims(profileDims)), + profile->setDimensions(tensorName, OptProfileSelector::kOPT, toDims(profileDims)), "Error in set dimensions to profile OPT", false, err); - profileDims = shapes[static_cast(nvinfer1::OptProfileSelector::kMAX)]; + profileDims = shapes[static_cast(OptProfileSelector::kMAX)]; SMP_RETVAL_IF_FALSE( - profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, toDims(profileDims)), + profile->setDimensions(tensorName, OptProfileSelector::kMAX, toDims(profileDims)), "Error in set dimensions to profile MAX", false, err); + sample::gLogInfo << "Set shape of input tensor " << tensorName << " for optimization profile " << i + << " to:" + << " MIN=" << shapes[static_cast(OptProfileSelector::kMIN)] + << " OPT=" << shapes[static_cast(OptProfileSelector::kOPT)] + << " MAX=" << shapes[static_cast(OptProfileSelector::kMAX)] << std::endl; } } } } - if (!hasDynamicShapes && !build.shapes.empty()) + for (uint32_t i = 0, n = network.getNbOutputs(); i < n; i++) + { + auto* output = network.getOutput(i); + auto const dims = 
output->getDimensions(); + // A shape tensor output with known static dimensions may have dynamic shape values inside it. + auto const isDynamicOutput = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }) + || output->isShapeTensor(); + if (isDynamicOutput) + { + hasDynamicShapes = true; + } + } + + if (!hasDynamicShapes && !build.optProfiles[0].empty()) { sample::gLogError << "Static model does not take explicit shapes since the shape of inference tensors will be " "determined by the model itself" @@ -865,10 +810,14 @@ bool setupNetworkAndConfig(const BuildOptions& build, const SystemOptions& sys, return false; } - if (profile && hasDynamicShapes) + if (hasDynamicShapes) { - SMP_RETVAL_IF_FALSE(profile->isValid(), "Required optimization profile is invalid", false, err); - SMP_RETVAL_IF_FALSE(config.addOptimizationProfile(profile) != -1, "Error in add optimization profile", false, err); + for (auto profile : profiles) + { + SMP_RETVAL_IF_FALSE(profile->isValid(), "Required optimization profile is invalid", false, err); + SMP_RETVAL_IF_FALSE( + config.addOptimizationProfile(profile) != -1, "Error in add optimization profile", false, err); + } } bool broadcastOutputFormats = broadcastIOFormats(build.outputFormats, network.getNbOutputs(), false); @@ -879,43 +828,118 @@ bool setupNetworkAndConfig(const BuildOptions& build, const SystemOptions& sys, auto* output = network.getOutput(i); if (!build.outputFormats.empty()) { - int outputFormatIndex = broadcastOutputFormats ? 0 : i; + int32_t outputFormatIndex = broadcastOutputFormats ? 0 : i; output->setType(build.outputFormats[outputFormatIndex].first); output->setAllowedFormats(build.outputFormats[outputFormatIndex].second); } - else - { - output->setAllowedFormats(1U << static_cast(nvinfer1::TensorFormat::kLINEAR)); - } } setMemoryPoolLimits(config, build); + setPreviewFeatures(config, build); + + if (build.builderOptimizationLevel != defaultBuilderOptimizationLevel) + { + config.setBuilderOptimizationLevel(build.builderOptimizationLevel); + } + + if (build.maxTactics != defaultMaxTactics) + { +#if (NV_TENSORRT_MAJOR < 9) + config.setMaxNbTactics(build.maxTactics); +#else + config.setTacticSources(build.maxTactics); +#endif + } + if (build.timingCacheMode == TimingCacheMode::kDISABLE) - config.setFlag(nvinfer1::BuilderFlag::kDISABLE_TIMING_CACHE); + { + config.setFlag(BuilderFlag::kDISABLE_TIMING_CACHE); + } + + if (build.disableCompilationCache) + { + config.setFlag(BuilderFlag::kDISABLE_COMPILATION_CACHE); + } + + if (build.errorOnTimingCacheMiss) + { + config.setFlag(BuilderFlag::kERROR_ON_TIMING_CACHE_MISS); + } if (!build.tf32) - config.clearFlag(nvinfer1::BuilderFlag::kTF32); + { + config.clearFlag(BuilderFlag::kTF32); + } if (build.refittable) - config.setFlag(nvinfer1::BuilderFlag::kREFIT); + { + config.setFlag(BuilderFlag::kREFIT); + } + + if (build.stripWeights) + { + // The kREFIT_IDENTICAL is enabled by default when kSTRIP_PLAN is on. 
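// Stripping removes the weight values from the serialized plan, which makes the file much
// smaller; at deployment the weights must be supplied again through the refit API (hence
// the implied kREFIT_IDENTICAL), typically from the same model the plan was built from.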
+ config.setFlag(BuilderFlag::kSTRIP_PLAN); + } + + if (build.versionCompatible) + { + config.setFlag(BuilderFlag::kVERSION_COMPATIBLE); + } +#if !TRT_WINML + std::vector pluginPaths; + for (auto const& pluginPath : sys.setPluginsToSerialize) + { + sample::gLogVerbose << "Setting plugin to serialize: " << pluginPath << std::endl; + pluginPaths.push_back(pluginPath.c_str()); + } + if (!pluginPaths.empty()) + { + config.setPluginsToSerialize(pluginPaths.data(), pluginPaths.size()); + } +#endif + if (build.excludeLeanRuntime) + { + config.setFlag(BuilderFlag::kEXCLUDE_LEAN_RUNTIME); + } if (build.sparsity != SparsityFlag::kDISABLE) { - config.setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS); + config.setFlag(BuilderFlag::kSPARSE_WEIGHTS); if (build.sparsity == SparsityFlag::kFORCE) + { sparsify(network, sparseWeights); + } } config.setProfilingVerbosity(build.profilingVerbosity); - config.setMinTimingIterations(build.minTiming); config.setAvgTimingIterations(build.avgTiming); if (build.fp16) - config.setFlag(nvinfer1::BuilderFlag::kFP16); - + { + config.setFlag(BuilderFlag::kFP16); + } if (build.int8) - config.setFlag(nvinfer1::BuilderFlag::kINT8); + { + config.setFlag(BuilderFlag::kINT8); + } + if (build.bf16) + { + config.setFlag(BuilderFlag::kBF16); + } + + SMP_RETVAL_IF_FALSE(!(build.int8 && build.fp8), "FP8 and INT8 precisions have been specified", false, err); + + if (build.fp8) + { + config.setFlag(BuilderFlag::kFP8); + } + + if (build.int4) + { + config.setFlag(BuilderFlag::kINT4); + } if (build.int8 && !build.fp16) { @@ -925,18 +949,20 @@ bool setupNetworkAndConfig(const BuildOptions& build, const SystemOptions& sys, << std::endl; } - auto isInt8 = [](const IOFormat& format) { return format.first == nvinfer1::DataType::kINT8; }; + auto isInt8 = [](const IOFormat& format) { return format.first == DataType::kINT8; }; auto int8IO = std::count_if(build.inputFormats.begin(), build.inputFormats.end(), isInt8) + std::count_if(build.outputFormats.begin(), build.outputFormats.end(), isInt8); - auto hasQDQLayers = [](nvinfer1::INetworkDefinition& network) { + auto hasQDQLayers = [](INetworkDefinition& network) { // Determine if our network has QDQ layers. 
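// "QDQ" refers to explicit IQuantizeLayer/IDequantizeLayer nodes. A network that contains
// them is explicitly quantized and carries its own INT8 scales, so the random calibrator
// and the placeholder dynamic ranges used elsewhere in this file only matter for
// implicitly quantized networks.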
- const auto nbLayers = network.getNbLayers(); + auto const nbLayers = network.getNbLayers(); for (int32_t i = 0; i < nbLayers; i++) { - const auto& layer = network.getLayer(i); - if (layer->getType() == nvinfer1::LayerType::kQUANTIZE || layer->getType() == nvinfer1::LayerType::kDEQUANTIZE) + auto const& layer = network.getLayer(i); + if (layer->getType() == LayerType::kQUANTIZE || layer->getType() == LayerType::kDEQUANTIZE) + { return true; + } } return false; }; @@ -965,28 +991,37 @@ bool setupNetworkAndConfig(const BuildOptions& build, const SystemOptions& sys, return false; } } - nvinfer1::IOptimizationProfile* profileCalib{nullptr}; + IOptimizationProfile* profileCalib{nullptr}; if (!build.shapesCalib.empty()) { profileCalib = builder.createOptimizationProfile(); for (uint32_t i = 0, n = network.getNbInputs(); i < n; i++) { auto* input = network.getInput(i); - nvinfer1::Dims profileDims{}; - auto shape = build.shapesCalib.find(input->getName()); - ShapeRange shapesCalib{}; - shapesCalib = shape->second; + Dims profileDims{}; + auto const tensorName = input->getName(); + auto shape = findPlausible(build.shapesCalib, tensorName); - profileDims = toDims(shapesCalib[static_cast(nvinfer1::OptProfileSelector::kOPT)]); + if (shape == build.shapesCalib.end()) + { + std::ostringstream msg; + msg << "Calibration profile for tensor " << tensorName << " cannot be found!"; + throw std::invalid_argument(msg.str()); + } + + auto shapesCalib = shape->second; + profileDims = toDims(shapesCalib[static_cast(OptProfileSelector::kOPT)]); // Here we check only kMIN as all profileDims are the same. - SMP_RETVAL_IF_FALSE( - profileCalib->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, profileDims), + SMP_RETVAL_IF_FALSE(profileCalib->setDimensions(tensorName, OptProfileSelector::kMIN, profileDims), "Error in set dimensions to calibration profile OPT", false, err); - profileCalib->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, profileDims); - profileCalib->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, profileDims); + profileCalib->setDimensions(tensorName, OptProfileSelector::kOPT, profileDims); + profileCalib->setDimensions(tensorName, OptProfileSelector::kMAX, profileDims); + sample::gLogInfo << "Set calibration profile for input tensor " << tensorName << " to " << profileDims + << std::endl; } SMP_RETVAL_IF_FALSE(profileCalib->isValid(), "Calibration profile is invalid", false, err); - SMP_RETVAL_IF_FALSE(config.setCalibrationProfile(profileCalib), "Error in set calibration profile", false, err); + SMP_RETVAL_IF_FALSE( + config.setCalibrationProfile(profileCalib), "Error in set calibration profile", false, err); } std::vector elemCount{}; @@ -994,59 +1029,96 @@ bool setupNetworkAndConfig(const BuildOptions& build, const SystemOptions& sys, { auto* input = network.getInput(i); auto const dims = input->getDimensions(); - auto const isDynamicInput = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }); + auto const isDynamicInput + = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }); if (profileCalib) - elemCount.push_back(volume(profileCalib->getDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT))); - else if (profile && isDynamicInput) - elemCount.push_back(volume(profile->getDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT))); + { + elemCount.push_back(volume(profileCalib->getDimensions(input->getName(), OptProfileSelector::kOPT))); + } + else if 
(!profiles.empty() && isDynamicInput) + { + elemCount.push_back( + volume(profiles[build.calibProfile]->getDimensions(input->getName(), OptProfileSelector::kOPT))); + } else + { elemCount.push_back(volume(input->getDimensions())); + } } - config.setInt8Calibrator(new RndInt8Calibrator(1, elemCount, build.calibration, network, err)); + calibrator.reset(new RndInt8Calibrator(1, elemCount, build.calibration, network, err)); + config.setInt8Calibrator(calibrator.get()); } if (build.directIO) - config.setFlag(nvinfer1::BuilderFlag::kDIRECT_IO); + { + config.setFlag(BuilderFlag::kDIRECT_IO); + } switch (build.precisionConstraints) { case PrecisionConstraints::kNONE: // It's the default for TensorRT. break; - case PrecisionConstraints::kOBEY: - config.setFlag(nvinfer1::BuilderFlag::kOBEY_PRECISION_CONSTRAINTS); - break; - case PrecisionConstraints::kPREFER: config.setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); break; + case PrecisionConstraints::kOBEY: config.setFlag(BuilderFlag::kOBEY_PRECISION_CONSTRAINTS); break; + case PrecisionConstraints::kPREFER: config.setFlag(BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); break; } if (!build.layerPrecisions.empty() && build.precisionConstraints != PrecisionConstraints::kNONE) + { setLayerPrecisions(network, build.layerPrecisions); + } if (!build.layerOutputTypes.empty() && build.precisionConstraints != PrecisionConstraints::kNONE) + { setLayerOutputTypes(network, build.layerOutputTypes); + } + + if (!build.layerDeviceTypes.empty()) + { + setLayerDeviceTypes(network, config, build.layerDeviceTypes); + } - if (build.safe) - config.setEngineCapability(sys.DLACore != -1 ? nvinfer1::EngineCapability::kDLA_STANDALONE : nvinfer1::EngineCapability::kSAFETY); + if (!build.debugTensors.empty()) + { + markDebugTensors(network, build.debugTensors); + } + + if (build.safe && sys.DLACore == -1) + { + config.setEngineCapability(EngineCapability::kSAFETY); + } if (build.restricted) - config.setFlag(nvinfer1::BuilderFlag::kSAFETY_SCOPE); + { + config.setFlag(BuilderFlag::kSAFETY_SCOPE); + } if (sys.DLACore != -1) { if (sys.DLACore < builder.getNbDLACores()) { - config.setDefaultDeviceType(nvinfer1::DeviceType::kDLA); + config.setDefaultDeviceType(DeviceType::kDLA); config.setDLACore(sys.DLACore); - config.setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); - - if (sys.fallback) - config.setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); - else // Reformatting runs on GPU, so avoid I/O reformatting - config.setFlag(nvinfer1::BuilderFlag::kDIRECT_IO); + config.setFlag(BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); + if (build.buildDLAStandalone) + { + config.setEngineCapability(EngineCapability::kDLA_STANDALONE); + } + if (build.allowGPUFallback) + { + config.setFlag(BuilderFlag::kGPU_FALLBACK); + } + else + { + // Reformatting runs on GPU, so avoid I/O reformatting. 
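// With GPU fallback disabled the whole network must run on the DLA, so kDIRECT_IO keeps
// the engine's I/O tensors in DLA-native formats instead of inserting GPU-side
// reformatting layers at the network boundaries.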
+ config.setFlag(BuilderFlag::kDIRECT_IO); + } if (!build.int8) - config.setFlag(nvinfer1::BuilderFlag::kFP16); + { + config.setFlag(BuilderFlag::kFP16); + } } else { @@ -1057,37 +1129,50 @@ bool setupNetworkAndConfig(const BuildOptions& build, const SystemOptions& sys, if (build.enabledTactics || build.disabledTactics) { - nvinfer1::TacticSources tacticSources = config.getTacticSources(); + TacticSources tacticSources = config.getTacticSources(); tacticSources |= build.enabledTactics; tacticSources &= ~build.disabledTactics; config.setTacticSources(tacticSources); } + config.setHardwareCompatibilityLevel(build.hardwareCompatibilityLevel); + config.setRuntimePlatform(build.runtimePlatform); + + if (build.maxAuxStreams != defaultMaxAuxStreams) + { + config.setMaxAuxStreams(build.maxAuxStreams); + } + + if (build.allowWeightStreaming) + { + config.setFlag(BuilderFlag::kWEIGHT_STREAMING); + } + return true; } //! -//! \brief Create an engine for a network defintion +//! \brief Create a serialized engine for a network defintion //! -//! \return Pointer to the engine created or nullptr if the creation failed +//! \return Whether the engine creation succeeds or fails. //! -bool networkToEngine(const BuildOptions& build, const SystemOptions& sys, nvinfer1::IBuilder& builder, - BuildEnvironment& env, std::ostream& err) +bool networkToSerializedEngine( + BuildOptions const& build, SystemOptions const& sys, IBuilder& builder, BuildEnvironment& env, std::ostream& err) { - TrtUniquePtr config{builder.createBuilderConfig()}; - std::vector> sparseWeights; + std::unique_ptr config{builder.createBuilderConfig()}; + std::unique_ptr calibrator; + std::vector> sparseWeights; SMP_RETVAL_IF_FALSE(config != nullptr, "Config creation failed", false, err); - SMP_RETVAL_IF_FALSE(setupNetworkAndConfig(build, sys, builder, *env.network, *config, err, sparseWeights), + SMP_RETVAL_IF_FALSE( + setupNetworkAndConfig(build, sys, builder, *env.network, *config, calibrator, err, sparseWeights), "Network And Config setup failed", false, err); - std::unique_ptr timingCache{nullptr}; + std::unique_ptr timingCache{}; // Try to load cache from file. Create a fresh cache if the file doesn't exist if (build.timingCacheMode == TimingCacheMode::kGLOBAL) { - std::vector loadedCache = loadTimingCacheFile(build.timingCacheFile); - timingCache.reset(config->createTimingCache(static_cast(loadedCache.data()), loadedCache.size())); - SMP_RETVAL_IF_FALSE(timingCache != nullptr, "TimingCache creation failed", false, err); - config->setTimingCache(*timingCache, false); + timingCache + = samplesCommon::buildTimingCacheFromFile(gLogger.getTRTLogger(), *config, build.timingCacheFile, err); } // CUDA stream used for profiling by the builder. @@ -1095,41 +1180,22 @@ bool networkToEngine(const BuildOptions& build, const SystemOptions& sys, nvinfe SMP_RETVAL_IF_FALSE(profileStream != nullptr, "Cuda stream creation failed", false, err); config->setProfileStream(*profileStream); - TrtUniquePtr serializedEngine{builder.buildSerializedNetwork(*env.network, *config)}; + auto const tBegin = std::chrono::high_resolution_clock::now(); + std::unique_ptr serializedEngine{builder.buildSerializedNetwork(*env.network, *config)}; SMP_RETVAL_IF_FALSE(serializedEngine != nullptr, "Engine could not be created from network", false, err); + auto const tEnd = std::chrono::high_resolution_clock::now(); + float const buildTime = std::chrono::duration(tEnd - tBegin).count(); + sample::gLogInfo << "Engine built in " << buildTime << " sec." 
<< std::endl; + sample::gLogInfo << "Created engine with size: " << (serializedEngine->size() / 1.0_MiB) << " MiB" << std::endl; - env.engineBlob.resize(serializedEngine->size()); - std::memcpy(env.engineBlob.data(), serializedEngine->data(), serializedEngine->size()); - - if (build.safe) - { - ASSERT(sample::hasSafeRuntime()); - std::unique_ptr safeRuntime{sample::createSafeInferRuntime(sample::gLogger.getTRTLogger())}; - SMP_RETVAL_IF_FALSE(safeRuntime != nullptr, "SafeRuntime creation failed", false, err); - safeRuntime->setErrorRecorder(&gRecorder); - env.safeEngine.reset(safeRuntime->deserializeCudaEngine(serializedEngine->data(), serializedEngine->size())); - if (build.consistency) - checkSafeEngine(serializedEngine->data(), serializedEngine->size()); + env.engine.setBlob(serializedEngine); - SMP_RETVAL_IF_FALSE(env.safeEngine != nullptr, "SafeEngine deserialization failed", false, err); - } - else + if (build.timingCacheMode == TimingCacheMode::kGLOBAL) { - TrtUniquePtr runtime{nvinfer1::createInferRuntime(sample::gLogger.getTRTLogger())}; - SMP_RETVAL_IF_FALSE(runtime != nullptr, "Runtime creation failed", false, err); - runtime->setErrorRecorder(&gRecorder); - env.engine.reset(runtime->deserializeCudaEngine(serializedEngine->data(), serializedEngine->size())); - SMP_RETVAL_IF_FALSE(env.engine != nullptr, "Engine deserialization failed", false, err); - if (build.timingCacheMode == TimingCacheMode::kGLOBAL) - { - auto const& timingCache = config->getTimingCache(); - std::unique_ptr timingCacheHostData{timingCache->serialize()}; - SMP_RETVAL_IF_FALSE(timingCacheHostData != nullptr, "Timing Cache serialization failed", false, err); - saveTimingCacheFile(build.timingCacheFile, timingCacheHostData.get()); - } - if (config->getInt8Calibrator()) - delete config->getInt8Calibrator(); + auto timingCache = config->getTimingCache(); + samplesCommon::updateTimingCacheFile(gLogger.getTRTLogger(), build.timingCacheFile, timingCache, builder); } + return true; } @@ -1137,24 +1203,67 @@ bool networkToEngine(const BuildOptions& build, const SystemOptions& sys, nvinfe //! \brief Parse a given model, create a network and an engine. //! bool modelToBuildEnv( - const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, BuildEnvironment& env, std::ostream& err) + ModelOptions const& model, BuildOptions const& build, SystemOptions& sys, BuildEnvironment& env, std::ostream& err) { - TrtUniquePtr builder{nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger())}; - SMP_RETVAL_IF_FALSE(builder != nullptr, "Builder creation failed", false, err); - builder->setErrorRecorder(&gRecorder); - auto networkFlags = (build.maxBatch) ? 0U : 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + env.builder.reset(createBuilder()); + SMP_RETVAL_IF_FALSE(env.builder != nullptr, "Builder creation failed", false, err); + env.builder->setErrorRecorder(&gRecorder); + auto networkFlags = (build.stronglyTyped) + ? 
1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kSTRONGLY_TYPED) + : 0U; +#if !TRT_WINML + for (auto const& pluginPath : sys.dynamicPlugins) + { + env.builder->getPluginRegistry().loadLibrary(pluginPath.c_str()); + } +#endif + env.network.reset(env.builder->createNetworkV2(networkFlags)); - env.network.reset(builder->createNetworkV2(networkFlags)); + std::vector vcPluginLibrariesUsed; SMP_RETVAL_IF_FALSE(env.network != nullptr, "Network creation failed", false, err); - env.parser = modelToNetwork(model, *env.network, err); + env.parser + = modelToNetwork(model, build, *env.network, err, build.versionCompatible ? &vcPluginLibrariesUsed : nullptr); SMP_RETVAL_IF_FALSE(env.parser.operator bool(), "Parsing model failed", false, err); - SMP_RETVAL_IF_FALSE(networkToEngine(build, sys, *builder, env, err), "Building engine failed", false, err); + +#if !TRT_WINML + if (build.versionCompatible && !sys.ignoreParsedPluginLibs && !vcPluginLibrariesUsed.empty()) + { + sample::gLogInfo << "The following plugin libraries were identified by the parser as required for a " + "version-compatible engine:" + << std::endl; + for (auto const& lib : vcPluginLibrariesUsed) + { + sample::gLogInfo << " " << lib << std::endl; + } + if (!build.excludeLeanRuntime) + { + sample::gLogInfo << "These libraries will be added to --setPluginsToSerialize since --excludeLeanRuntime " + "was not specified." + << std::endl; + std::copy(vcPluginLibrariesUsed.begin(), vcPluginLibrariesUsed.end(), + std::back_inserter(sys.setPluginsToSerialize)); + } + sample::gLogInfo << "These libraries will be added to --dynamicPlugins for use at inference time." << std::endl; + std::copy(vcPluginLibrariesUsed.begin(), vcPluginLibrariesUsed.end(), std::back_inserter(sys.dynamicPlugins)); + + // Implicitly-added plugins from ONNX parser should be loaded into plugin registry as well. + for (auto const& pluginPath : vcPluginLibrariesUsed) + { + env.builder->getPluginRegistry().loadLibrary(pluginPath.c_str()); + } + + sample::gLogInfo << "Use --ignoreParsedPluginLibs to disable this behavior." << std::endl; + } +#endif + + SMP_RETVAL_IF_FALSE( + networkToSerializedEngine(build, sys, *env.builder, env, err), "Building engine failed", false, err); return true; } namespace { -std::pair, std::vector> getLayerWeightsRolePair(nvinfer1::IRefitter& refitter) +std::pair, std::vector> getLayerWeightsRolePair(IRefitter& refitter) { // Get number of refittable items. auto const nbAll = refitter.getAll(0, nullptr, nullptr); @@ -1165,69 +1274,103 @@ std::pair, std::vector> getLayer std::vector layerNameStrs(nbAll); std::transform(layerNames.begin(), layerNames.end(), layerNameStrs.begin(), [](char const* name) { if (name == nullptr) + { return std::string{}; - + } return std::string{name}; }); return {layerNameStrs, weightsRoles}; } -std::pair, std::vector> getMissingLayerWeightsRolePair(nvinfer1::IRefitter& refitter) +std::pair, std::vector> getMissingLayerWeightsRolePair(IRefitter& refitter) { // Get number of refittable items. auto const nbMissing = refitter.getMissing(0, nullptr, nullptr); - std::vector layerNames(nbMissing); + std::vector layerNames(nbMissing); // Allocate buffers for the items and get them. 
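// getMissing() follows the usual two-call pattern: the first call above (with null
// buffers) only returns the number of missing weights, and the second call below fills
// the layer-name and WeightsRole arrays.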
std::vector weightsRoles(nbMissing); refitter.getMissing(nbMissing, layerNames.data(), weightsRoles.data()); std::vector layerNameStrs(nbMissing); std::transform(layerNames.begin(), layerNames.end(), layerNameStrs.begin(), [](char const* name) { if (name == nullptr) + { return std::string{}; + } return std::string{name}; }); return {layerNameStrs, weightsRoles}; } +} // namespace + +bool loadStreamingEngineToBuildEnv(std::string const& filepath, BuildEnvironment& env, std::ostream& err) +{ + auto& reader = env.engine.getFileReader(); + SMP_RETVAL_IF_FALSE(reader.open(filepath), "", false, err << "Error opening engine file: " << filepath); + return true; +} -bool loadEngineToEnv(const std::string& engine, int DLACore, bool safe, bool enableConsistency, BuildEnvironment& env, std::ostream& err) +bool loadEngineToBuildEnv(std::string const& filepath, BuildEnvironment& env, std::ostream& err) { - std::ifstream engineFile(engine, std::ios::binary); - SMP_RETVAL_IF_FALSE(engineFile.good(), "", false, err << "Error opening engine file: " << engine); + auto const tBegin = std::chrono::high_resolution_clock::now(); + std::ifstream engineFile(filepath, std::ios::binary); + SMP_RETVAL_IF_FALSE(engineFile.good(), "", false, err << "Error opening engine file: " << filepath); engineFile.seekg(0, std::ifstream::end); int64_t fsize = engineFile.tellg(); engineFile.seekg(0, std::ifstream::beg); - env.engineBlob.resize(fsize); - engineFile.read(reinterpret_cast(env.engineBlob.data()), fsize); - SMP_RETVAL_IF_FALSE(engineFile.good(), "", false, err << "Error loading engine file: " << engine); + std::vector engineBlob(fsize); + engineFile.read(reinterpret_cast(engineBlob.data()), fsize); + SMP_RETVAL_IF_FALSE(engineFile.good(), "", false, err << "Error loading engine file: " << filepath); + auto const tEnd = std::chrono::high_resolution_clock::now(); + float const loadTime = std::chrono::duration(tEnd - tBegin).count(); + sample::gLogInfo << "Engine loaded in " << loadTime << " sec." << std::endl; + sample::gLogInfo << "Loaded engine with size: " << (fsize / 1.0_MiB) << " MiB" << std::endl; + + env.engine.setBlob(std::move(engineBlob)); + + return true; +} + +bool printPlanVersion(BuildEnvironment& env, std::ostream& err) +{ + constexpr int64_t kPLAN_SIZE{28}; + std::vector data(kPLAN_SIZE); + auto blob = data.data(); - if (safe) + auto& reader = env.engine.getFileReader(); + if (reader.isOpen()) { - ASSERT(sample::hasSafeRuntime()); - std::unique_ptr safeRuntime{sample::createSafeInferRuntime(sample::gLogger.getTRTLogger())}; - safeRuntime->setErrorRecorder(&gRecorder); - env.safeEngine.reset(safeRuntime->deserializeCudaEngine(env.engineBlob.data(), fsize)); - bool result = env.safeEngine != nullptr; - if (result && enableConsistency) - { - checkSafeEngine(env.engineBlob.data(), fsize); - } - return result; + SMP_RETVAL_IF_FALSE(reader.read(data.data(), kPLAN_SIZE) == kPLAN_SIZE, "Failed to read plan file", false, err); } + else + { + SMP_RETVAL_IF_FALSE(env.engine.getBlob().data != nullptr, "Plan file is empty", false, err); + SMP_RETVAL_IF_FALSE(env.engine.getBlob().size >= 28, "Plan file is incorrect", false, err); + blob = static_cast(env.engine.getBlob().data); + } + auto blob32 = reinterpret_cast(blob); - TrtUniquePtr runtime{nvinfer1::createInferRuntime(sample::gLogger.getTRTLogger())}; - if (DLACore != -1) - runtime->setDLACore(DLACore); - - runtime->setErrorRecorder(&gRecorder); - env.engine.reset(runtime->deserializeCudaEngine(env.engineBlob.data(), fsize)); - return env.engine != nullptr; + //! 
Correct TensorRT plan file starts with this tag + constexpr uint32_t kPLAN_FILE_TAG{0x74727466U}; + SMP_RETVAL_IF_FALSE(blob32[0] == kPLAN_FILE_TAG, "Failed to verify a plan tag.", false, err); + switch (blob32[1]) + { + case 0U: + { + // Blob index to store the plan version may depend on the serialization version. + sample::gLogInfo << "Plan was created with TensorRT version " << static_cast(blob[24]) + << "." << static_cast(blob[25]) << "." << static_cast(blob[26]) + << "." << static_cast(blob[27]) << std::endl; + return true; + } + } + sample::gLogError << "Serialization version is not supported." << std::endl; + return false; } -} // namespace void dumpRefittable(nvinfer1::ICudaEngine& engine) { - TrtUniquePtr refitter{nvinfer1::createInferRefitter(engine, sample::gLogger.getTRTLogger())}; + std::unique_ptr refitter{createRefitter(engine)}; if (refitter == nullptr) { sample::gLogError << "Failed to create a refitter." << std::endl; @@ -1244,13 +1387,13 @@ void dumpRefittable(nvinfer1::ICudaEngine& engine) } } -nvinfer1::ICudaEngine* loadEngine(const std::string& engine, int DLACore, std::ostream& err) +ICudaEngine* loadEngine(std::string const& engine, int32_t DLACore, std::ostream& err) { - BuildEnvironment env; - return loadEngineToEnv(engine, DLACore, false, false, env, err) ? env.engine.release() : nullptr; + BuildEnvironment env(/* isSafe */ false, /* versionCompatible */ false, DLACore, "", getTempfileControlDefaults()); + return loadEngineToBuildEnv(engine, env, err) ? env.engine.release() : nullptr; } -bool saveEngine(const nvinfer1::ICudaEngine& engine, const std::string& fileName, std::ostream& err) +bool saveEngine(const ICudaEngine& engine, std::string const& fileName, std::ostream& err) { std::ofstream engineFile(fileName, std::ios::binary); if (!engineFile) @@ -1259,7 +1402,7 @@ bool saveEngine(const nvinfer1::ICudaEngine& engine, const std::string& fileName return false; } - TrtUniquePtr serializedEngine{engine.serialize()}; + std::unique_ptr serializedEngine{engine.serialize()}; if (serializedEngine == nullptr) { err << "Engine serialization failed" << std::endl; @@ -1270,153 +1413,151 @@ bool saveEngine(const nvinfer1::ICudaEngine& engine, const std::string& fileName return !engineFile.fail(); } -bool getEngineBuildEnv(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, - BuildEnvironment& env, std::ostream& err) +bool getEngineBuildEnv( + const ModelOptions& model, BuildOptions const& build, SystemOptions& sys, BuildEnvironment& env, std::ostream& err) { - TrtUniquePtr engine; - TrtUniquePtr network; - Parser parser; - - bool createEngineSuccess {false}; + bool createEngineSuccess{false}; if (build.load) - createEngineSuccess = loadEngineToEnv(build.engine, sys.DLACore, build.safe, build.consistency, env, err); + { + if (build.safe) + { + createEngineSuccess = loadEngineToBuildEnv(build.engine, env, err); + } + else + { + createEngineSuccess = loadStreamingEngineToBuildEnv(build.engine, env, err); + } + } else + { createEngineSuccess = modelToBuildEnv(model, build, sys, env, err); + } - SMP_RETVAL_IF_FALSE(createEngineSuccess, "Failed to create engine from model.", false, err); + SMP_RETVAL_IF_FALSE(createEngineSuccess, "Failed to create engine from model or file.", false, err); + + if (build.getPlanVersionOnly && build.load) + { + SMP_RETVAL_IF_FALSE(printPlanVersion(env, err), "Failed to get plan file version.", false, err); + return true; + } if (build.save) { std::ofstream engineFile(build.engine, std::ios::binary); - 
engineFile.write(reinterpret_cast(env.engineBlob.data()), env.engineBlob.size()); + auto& engineBlob = env.engine.getBlob(); + engineFile.write(static_cast(engineBlob.data), engineBlob.size); SMP_RETVAL_IF_FALSE(!engineFile.fail(), "Saving engine to file failed.", false, err); + engineFile.flush(); + engineFile.close(); + if (!build.safe) + { + env.engine.releaseBlob(); + SMP_RETVAL_IF_FALSE(loadStreamingEngineToBuildEnv(build.engine, env, err), "Reading engine file failed.", false, err); + } } - return true; -} -nvinfer1::IHostMemory* networkToSerialized(const BuildOptions& build, const SystemOptions& sys, nvinfer1::IBuilder& builder, - nvinfer1::INetworkDefinition& network, std::ostream& err) -{ - TrtUniquePtr config{builder.createBuilderConfig()}; - std::vector> sparseWeights; - SMP_RETVAL_IF_FALSE(config != nullptr, "Config creation failed", nullptr, err); - SMP_RETVAL_IF_FALSE(setupNetworkAndConfig(build, sys, builder, network, *config, err, sparseWeights), - "Network And Config setup failed", nullptr, err); - return builder.buildSerializedNetwork(network, *config); -} - -nvinfer1::IHostMemory* modelToSerialized( - const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err) -{ - TrtUniquePtr builder{nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger())}; - SMP_RETVAL_IF_FALSE(builder != nullptr, "Builder creation failed", nullptr, err); - builder->setErrorRecorder(&gRecorder); - - auto networkFlags - = (build.maxBatch) ? 0U : 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); - - TrtUniquePtr network{builder->createNetworkV2(networkFlags)}; - SMP_RETVAL_IF_FALSE(network != nullptr, "Network creation failed", nullptr, err); - - Parser parser = modelToNetwork(model, *network, err); - SMP_RETVAL_IF_FALSE(parser.operator bool(), "Parsing model failed", nullptr, err); - - return networkToSerialized(build, sys, *builder, *network, err); -} - -bool serializeAndSave(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err) -{ - TrtUniquePtr serialized{modelToSerialized(model, build, sys, err)}; - SMP_RETVAL_IF_FALSE(serialized != nullptr, "Network serialization failed", false, err); - - std::ofstream engineFile(build.engine, std::ios::binary); - SMP_RETVAL_IF_FALSE(!!engineFile, "Cannot open a file to save a serialize network", false, err); - engineFile.write(static_cast(serialized->data()), serialized->size()); - return !engineFile.fail(); + return true; } // There is not a getWeightsName API, so we need to use WeightsRole. 
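// Refittable weights are addressed by (layer name, WeightsRole) pairs rather than by a
// weights name. A minimal sketch of how the pairs returned below are consumed, mirroring
// timeRefit() further down (the `engine` and `layer` objects are placeholders):
//
//   std::unique_ptr<nvinfer1::IRefitter> refitter{createRefitter(engine)};
//   for (auto const& roleWeights : getAllRefitWeightsForLayer(layer))
//   {
//       // setWeights() returns false if the (name, role) pair is not refittable.
//       refitter->setWeights(layer.getName(), roleWeights.first, roleWeights.second);
//   }
//   refitter->refitCudaEngine();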
-std::vector> getAllRefitWeightsForLayer(const nvinfer1::ILayer& l) +std::vector> getAllRefitWeightsForLayer(const ILayer& l) { switch (l.getType()) { - case nvinfer1::LayerType::kCONSTANT: - { - const auto& layer = static_cast(l); - return {std::make_pair(nvinfer1::WeightsRole::kCONSTANT, layer.getWeights())}; - } - case nvinfer1::LayerType::kCONVOLUTION: - { - const auto& layer = static_cast(l); - return {std::make_pair(nvinfer1::WeightsRole::kKERNEL, layer.getKernelWeights()), - std::make_pair(nvinfer1::WeightsRole::kBIAS, layer.getBiasWeights())}; - } - case nvinfer1::LayerType::kDECONVOLUTION: - { - const auto& layer = static_cast(l); - return {std::make_pair(nvinfer1::WeightsRole::kKERNEL, layer.getKernelWeights()), - std::make_pair(nvinfer1::WeightsRole::kBIAS, layer.getBiasWeights())}; - } - case nvinfer1::LayerType::kFULLY_CONNECTED: - { - const auto& layer = static_cast(l); - return {std::make_pair(nvinfer1::WeightsRole::kKERNEL, layer.getKernelWeights()), - std::make_pair(nvinfer1::WeightsRole::kBIAS, layer.getBiasWeights())}; - } - case nvinfer1::LayerType::kSCALE: - { - const auto& layer = static_cast(l); - return {std::make_pair(nvinfer1::WeightsRole::kSCALE, layer.getScale()), - std::make_pair(nvinfer1::WeightsRole::kSHIFT, layer.getShift())}; - } - case nvinfer1::LayerType::kRNN_V2: - case nvinfer1::LayerType::kACTIVATION: - case nvinfer1::LayerType::kPOOLING: - case nvinfer1::LayerType::kLRN: - case nvinfer1::LayerType::kSOFTMAX: - case nvinfer1::LayerType::kSHUFFLE: - case nvinfer1::LayerType::kCONCATENATION: - case nvinfer1::LayerType::kELEMENTWISE: - case nvinfer1::LayerType::kPLUGIN: - case nvinfer1::LayerType::kUNARY: - case nvinfer1::LayerType::kPADDING: - case nvinfer1::LayerType::kREDUCE: - case nvinfer1::LayerType::kTOPK: - case nvinfer1::LayerType::kGATHER: - case nvinfer1::LayerType::kMATRIX_MULTIPLY: - case nvinfer1::LayerType::kRAGGED_SOFTMAX: - case nvinfer1::LayerType::kIDENTITY: - case nvinfer1::LayerType::kPLUGIN_V2: - case nvinfer1::LayerType::kSLICE: - case nvinfer1::LayerType::kFILL: - case nvinfer1::LayerType::kSHAPE: - case nvinfer1::LayerType::kPARAMETRIC_RELU: - case nvinfer1::LayerType::kRESIZE: - case nvinfer1::LayerType::kTRIP_LIMIT: - case nvinfer1::LayerType::kRECURRENCE: - case nvinfer1::LayerType::kITERATOR: - case nvinfer1::LayerType::kLOOP_OUTPUT: - case nvinfer1::LayerType::kSELECT: - case nvinfer1::LayerType::kQUANTIZE: - case nvinfer1::LayerType::kDEQUANTIZE: - case nvinfer1::LayerType::kCONDITION: - case nvinfer1::LayerType::kCONDITIONAL_INPUT: - case nvinfer1::LayerType::kCONDITIONAL_OUTPUT: - case nvinfer1::LayerType::kSCATTER: - case nvinfer1::LayerType::kEINSUM: - case nvinfer1::LayerType::kASSERTION: return {}; + case LayerType::kCONSTANT: + { + auto const& layer = static_cast(l); + auto const weights = layer.getWeights(); + switch (weights.type) + { + case DataType::kFLOAT: + case DataType::kHALF: + case DataType::kBF16: + case DataType::kINT8: + case DataType::kINT32: + case DataType::kINT64: return {std::make_pair(WeightsRole::kCONSTANT, weights)}; + case DataType::kBOOL: + case DataType::kUINT8: + case DataType::kFP8: + case DataType::kINT4: + // Refit not supported for these types. 
+ break; + } + break; + } + case LayerType::kCONVOLUTION: + { + auto const& layer = static_cast(l); + return {std::make_pair(WeightsRole::kKERNEL, layer.getKernelWeights()), + std::make_pair(WeightsRole::kBIAS, layer.getBiasWeights())}; + } + case LayerType::kDECONVOLUTION: + { + auto const& layer = static_cast(l); + return {std::make_pair(WeightsRole::kKERNEL, layer.getKernelWeights()), + std::make_pair(WeightsRole::kBIAS, layer.getBiasWeights())}; + } + case LayerType::kSCALE: + { + auto const& layer = static_cast(l); + return {std::make_pair(WeightsRole::kSCALE, layer.getScale()), + std::make_pair(WeightsRole::kSHIFT, layer.getShift())}; + } + case LayerType::kACTIVATION: + case LayerType::kASSERTION: + case LayerType::kCAST: + case LayerType::kCONCATENATION: + case LayerType::kCONDITION: + case LayerType::kCONDITIONAL_INPUT: + case LayerType::kCONDITIONAL_OUTPUT: + case LayerType::kDEQUANTIZE: + case LayerType::kEINSUM: + case LayerType::kELEMENTWISE: + case LayerType::kFILL: + case LayerType::kGATHER: + case LayerType::kGRID_SAMPLE: + case LayerType::kIDENTITY: + case LayerType::kITERATOR: + case LayerType::kLOOP_OUTPUT: + case LayerType::kLRN: + case LayerType::kMATRIX_MULTIPLY: + case LayerType::kNMS: + case LayerType::kNON_ZERO: + case LayerType::kNORMALIZATION: + case LayerType::kONE_HOT: + case LayerType::kPADDING: + case LayerType::kPARAMETRIC_RELU: + case LayerType::kPLUGIN: + case LayerType::kPLUGIN_V2: + case LayerType::kPLUGIN_V3: + case LayerType::kPOOLING: + case LayerType::kQUANTIZE: + case LayerType::kRAGGED_SOFTMAX: + case LayerType::kRECURRENCE: + case LayerType::kREDUCE: + case LayerType::kRESIZE: + case LayerType::kREVERSE_SEQUENCE: + case LayerType::kSCATTER: + case LayerType::kSELECT: + case LayerType::kSHAPE: + case LayerType::kSHUFFLE: + case LayerType::kSLICE: + case LayerType::kSOFTMAX: + case LayerType::kTOPK: + case LayerType::kTRIP_LIMIT: + case LayerType::kUNARY: return {}; } return {}; } -bool timeRefit(nvinfer1::INetworkDefinition const& network, nvinfer1::ICudaEngine& engine, bool multiThreading) +bool timeRefit(INetworkDefinition const& network, nvinfer1::ICudaEngine& engine, bool multiThreading) { using time_point = std::chrono::time_point; using durationMs = std::chrono::duration; auto const nbLayers = network.getNbLayers(); - TrtUniquePtr refitter{nvinfer1::createInferRefitter(engine, sample::gLogger.getTRTLogger())}; + std::unique_ptr refitter{createRefitter(engine)}; // Set max threads that can be used by refitter. if (multiThreading && !refitter->setMaxThreads(10)) { @@ -1424,17 +1565,17 @@ bool timeRefit(nvinfer1::INetworkDefinition const& network, nvinfer1::ICudaEngin return false; } auto const& layerWeightsRolePair = getLayerWeightsRolePair(*refitter); - // We use std::string instead of const char* since we can have copies of layer names. - std::set> layerRoleSet; + // We use std::string instead of char const* since we can have copies of layer names. 
+ std::set> layerRoleSet; auto const& layerNames = layerWeightsRolePair.first; auto const& weightsRoles = layerWeightsRolePair.second; std::transform(layerNames.begin(), layerNames.end(), weightsRoles.begin(), std::inserter(layerRoleSet, layerRoleSet.begin()), - [](std::string const& layerName, nvinfer1::WeightsRole const role) { return std::make_pair(layerName, role); }); + [](std::string const& layerName, WeightsRole const role) { return std::make_pair(layerName, role); }); - auto const isRefittable = [&layerRoleSet](char const* layerName, nvinfer1::WeightsRole const role) { + auto const isRefittable = [&layerRoleSet](char const* layerName, WeightsRole const role) { return layerRoleSet.find(std::make_pair(layerName, role)) != layerRoleSet.end(); }; @@ -1449,7 +1590,9 @@ bool timeRefit(nvinfer1::INetworkDefinition const& network, nvinfer1::ICudaEngin { bool const success = refitter->setWeights(layer->getName(), roleWeights.first, roleWeights.second); if (!success) + { return false; + } } } } @@ -1468,29 +1611,35 @@ bool timeRefit(nvinfer1::INetworkDefinition const& network, nvinfer1::ICudaEngin return layerNames.empty(); }; + // Skip weights validation since we are confident that the new weights are similar to the weights used to build + // engine. + refitter->setWeightsValidation(false); + // Warm up and report missing weights + // We only need to set weights for the first time and that can be reused in later refitting process. bool const success = setWeights() && reportMissingWeights() && refitter->refitCudaEngine(); if (!success) { return false; } - constexpr int32_t loop = 10; + TrtCudaStream stream; + constexpr int32_t kLOOP = 10; time_point const refitStartTime{std::chrono::steady_clock::now()}; { - for (int32_t l = 0; l < loop; l++) + for (int32_t l = 0; l < kLOOP; l++) { - bool const success = setWeights() && refitter->refitCudaEngine(); - if (!success) + if (!refitter->refitCudaEngineAsync(stream.get())) { return false; } } } + stream.synchronize(); time_point const refitEndTime{std::chrono::steady_clock::now()}; sample::gLogInfo << "Engine refitted" - << " in " << durationMs(refitEndTime - refitStartTime).count() / loop << " ms." << std::endl; + << " in " << durationMs(refitEndTime - refitStartTime).count() / kLOOP << " ms." << std::endl; return true; } @@ -1499,28 +1648,20 @@ namespace void* initSafeRuntime() { void* handle{nullptr}; + // libsafe_executor.so will be renamed to libnvinfer_safe.so when TRTS-9421 completes. + // Currently libsafe_executor_debug.so for samplesCommon::isDebug() is not ready. +#define TRTS_9421_COMPLETED 0 +#if TRTS_9421_COMPLETED #if !defined(_WIN32) - std::string const dllName{samplesCommon::isDebug() ? "libnvinfer_safe_debug.so.8" : "libnvinfer_safe.so.8"}; + std::string const dllName{"libsafe_executor.so"}; #if SANITIZER_BUILD handle = dlopen(dllName.c_str(), RTLD_LAZY | RTLD_NODELETE); #else - handle = dlopen(dllName.c_str(), RTLD_LAZY); -#endif -#endif - return handle; -} - -void* initConsistencyCheckerLibrary() -{ - void* handle{nullptr}; -#if !defined(_WIN32) - std::string const dllName{samplesCommon::isDebug() ? 
"libnvinfer_checker_debug.so.8" : "libnvinfer_checker.so.8"}; -#if SANITIZER_BUILD - handle = dlopen(dllName.c_str(), RTLD_LAZY | RTLD_NODELETE); -#else - handle = dlopen(dllName.c_str(), RTLD_LAZY); + // RTLD_GLOBAL is used for symbol resolution of subsequently loaded plugin libraries + handle = dlopen(dllName.c_str(), RTLD_LAZY | RTLD_GLOBAL); #endif #endif +#endif // TRTS_9421_COMPLETED return handle; } @@ -1536,7 +1677,6 @@ struct DllDeleter } }; const std::unique_ptr safeRuntimeLibrary{initSafeRuntime()}; -const std::unique_ptr consistencyCheckerLibrary{initConsistencyCheckerLibrary()}; #endif } // namespace @@ -1549,81 +1689,4 @@ bool hasSafeRuntime() return ret; } -nvinfer1::safe::IRuntime* createSafeInferRuntime(nvinfer1::ILogger& logger) noexcept -{ - nvinfer1::safe::IRuntime* runtime{nullptr}; -#if !defined(_WIN32) - constexpr char symbolName[] = "_ZN8nvinfer14safe18createInferRuntimeERNS_7ILoggerE"; - typedef nvinfer1::safe::IRuntime* (*CreateInferRuntimeFn)(nvinfer1::ILogger & logger); - if (hasSafeRuntime()) - { - auto createFn = reinterpret_cast(dlsym(safeRuntimeLibrary.get(), symbolName)); - if (createFn != nullptr) - { - runtime = createFn(logger); - } - } -#endif - return runtime; -} - -bool hasConsistencyChecker() -{ - bool ret{false}; -#if !defined(_WIN32) - ret = (consistencyCheckerLibrary != nullptr); -#endif - return ret; -} - -nvinfer1::consistency::IConsistencyChecker* createConsistencyChecker( - nvinfer1::ILogger& logger, void const* serializedEngine, int32_t const engineSize) noexcept -{ - nvinfer1::consistency::IConsistencyChecker* checker{nullptr}; - - if (serializedEngine == nullptr || engineSize == 0) - { - return checker; - } - -#if !defined(_WIN32) - constexpr char symbolName[] = "createConsistencyChecker_INTERNAL"; - typedef nvinfer1::consistency::IConsistencyChecker* (*CreateCheckerFn)( - nvinfer1::ILogger * logger, void const* data, size_t size, uint32_t version); - if (hasSafeRuntime()) - { - auto createFn = reinterpret_cast(dlsym(consistencyCheckerLibrary.get(), symbolName)); - if (createFn != nullptr) - { - checker = createFn(&logger, serializedEngine, engineSize, NV_TENSORRT_VERSION); - } - } -#endif - return checker; -} - -bool checkSafeEngine(void const* serializedEngine, int32_t const engineSize) -{ - - if (!hasConsistencyChecker()) - { - sample::gLogError << "Cannot perform consistency check because the checker is not loaded.." << std::endl; - return false; - } - auto checker = std::unique_ptr( - createConsistencyChecker(sample::gLogger.getTRTLogger(), serializedEngine, engineSize)); - if (checker.get() == nullptr) - { - sample::gLogError << "Failed to create consistency checker." << std::endl; - return false; - } - sample::gLogInfo << "Start consistency checking." << std::endl; - if (!checker->validate()) - { - sample::gLogError << "Consistency validation failed." << std::endl; - return false; - } - sample::gLogInfo << "Consistency validation passed." << std::endl; - return true; -} } // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleEngines.h b/src/Detector/tensorrt_yolo/common/sampleEngines.h index 620b51a1..ec02e909 100644 --- a/src/Detector/tensorrt_yolo/common/sampleEngines.h +++ b/src/Detector/tensorrt_yolo/common/sampleEngines.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -17,58 +18,227 @@ #ifndef TRT_SAMPLE_ENGINES_H #define TRT_SAMPLE_ENGINES_H -#include -#include - #include "NvInfer.h" - -#if (NV_TENSORRT_MAJOR > 7) - -#include "NvInferConsistency.h" -#include "NvInferSafeRuntime.h" - -#endif - #include "NvOnnxParser.h" #include "sampleOptions.h" #include "sampleUtils.h" +#include "streamReader.h" +#include +#include namespace sample { struct Parser { - TrtUniquePtr onnxParser; + std::unique_ptr onnxParser; operator bool() const { - return onnxParser.operator bool(); + return onnxParser != nullptr; } }; -struct BuildEnvironment +//! +//! \brief Helper struct to faciliate engine serialization and deserialization. It does not own the underlying memory. +//! +struct EngineBlob { - TrtUniquePtr network; - //! Parser that creates the network. Must be declared *after* network, so that when - //! ~BuildEnvironment() executes, the parser is destroyed before the network is destroyed. - Parser parser; - TrtUniquePtr engine; - std::unique_ptr safeEngine; - std::vector engineBlob; + EngineBlob(void* engineData, size_t engineSize) + : data(engineData) + , size(engineSize) + { + } + void* data{}; + size_t size{}; + bool empty() const + { + return size == 0; + } }; //! -//! \brief Generate a network definition for a given model -//! -//! \return Parser The parser used to initialize the network and that holds the weights for the network, or an invalid -//! parser (the returned parser converts to false if tested) +//! \brief A helper class to hold a serialized engine (std or safe) and only deserialize it when being accessed. //! -//! Constant input dimensions in the model must not be changed in the corresponding -//! network definition, because its correctness may rely on the constants. -//! -//! \see Parser::operator bool() -//! -Parser modelToNetwork(const ModelOptions& model, nvinfer1::INetworkDefinition& network, std::ostream& err); +class LazilyDeserializedEngine +{ +public: + //! + //! \brief Delete default constructor to make sure isSafe and DLACore are always set. + //! + LazilyDeserializedEngine() = delete; + + //! + //! \brief Constructor of LazilyDeserializedEngine. + //! + LazilyDeserializedEngine(bool isSafe, bool versionCompatible, int32_t DLACore, std::string const& tempdir, + nvinfer1::TempfileControlFlags tempfileControls, std::string const& leanDLLPath) + : mIsSafe(isSafe) + , mVersionCompatible(versionCompatible) + , mDLACore(DLACore) + , mTempdir(tempdir) + , mTempfileControls(tempfileControls) + , mLeanDLLPath(leanDLLPath) + { + mFileReader = std::make_unique(); + } + + //! + //! \brief Move from another LazilyDeserializedEngine. + //! + LazilyDeserializedEngine(LazilyDeserializedEngine&& other) = default; + + //! + //! \brief Delete copy constructor. + //! + LazilyDeserializedEngine(LazilyDeserializedEngine const& other) = delete; + + //! + //! \brief Get the pointer to the ICudaEngine. Triggers deserialization if not already done so. + //! + nvinfer1::ICudaEngine* get(); + + //! + //! \brief Get the pointer to the ICudaEngine and release the ownership. + //! + nvinfer1::ICudaEngine* release(); + + //! 
+ //! \brief Get the underlying blob storing serialized engine. + //! + EngineBlob const getBlob() const + { + ASSERT((!mFileReader || !mFileReader->isOpen()) + && "Attempting to access the glob when there is an open file reader!"); + if (!mEngineBlob.empty()) + { + return EngineBlob{const_cast(static_cast(mEngineBlob.data())), mEngineBlob.size()}; + } + if (mEngineBlobHostMemory.get() != nullptr && mEngineBlobHostMemory->size() > 0) + { + return EngineBlob{mEngineBlobHostMemory->data(), mEngineBlobHostMemory->size()}; + } + ASSERT(false && "Attempting to access an empty engine!"); + return EngineBlob{nullptr, 0}; + } + + //! + //! \brief Set the underlying blob storing the serialized engine without duplicating IHostMemory. + //! + void setBlob(std::unique_ptr& data) + { + ASSERT(data.get() && data->size() > 0); + mEngineBlobHostMemory = std::move(data); + mEngine.reset(); + } + + //! + //! \brief Set the underlying blob storing the serialized engine without duplicating vector memory. + //! + void setBlob(std::vector&& engineBlob) + { + mEngineBlob = std::move(engineBlob); + mEngine.reset(); + } + + //! + //! \brief Release the underlying blob without deleting the deserialized engine. + //! + void releaseBlob() + { + mEngineBlob.clear(); + mEngineBlobHostMemory.reset(); + } + + //! + //! \brief Get the file stream reader used for deserialization + //! + samplesCommon::FileStreamReader& getFileReader() + { + ASSERT(mFileReader); + return *mFileReader; + } + + //! + //! \brief Get if safe mode is enabled. + //! + bool isSafe() + { + return mIsSafe; + } + + void setDynamicPlugins(std::vector const& dynamicPlugins) + { + mDynamicPlugins = dynamicPlugins; + } + +private: + bool mIsSafe{false}; + bool mVersionCompatible{false}; + int32_t mDLACore{-1}; + std::vector mEngineBlob; + std::unique_ptr mFileReader; + + // Directly use the host memory of a serialized engine instead of duplicating the engine in CPU memory. + std::unique_ptr mEngineBlobHostMemory; + + std::string mTempdir{}; + nvinfer1::TempfileControlFlags mTempfileControls{getTempfileControlDefaults()}; + std::string mLeanDLLPath{}; + std::vector mDynamicPlugins; + + //! \name Owned TensorRT objects + //! Per TensorRT object lifetime requirements as outlined in the developer guide, + //! the runtime must remain live while any engines created by the runtime are live. + //! DO NOT ADJUST the declaration order here: runtime -> (engine). + //! Destruction occurs in reverse declaration order: (engine) -> runtime. + //!@{ + + //! The runtime used to track parent of mRuntime if one exists. + //! Needed to load mRuntime if lean.so is supplied through file system path. + std::unique_ptr mParentRuntime{}; + + //! The runtime that is used to deserialize the engine. + std::unique_ptr mRuntime{}; + + //! If mIsSafe is false, this points to the deserialized std engine + std::unique_ptr mEngine{}; + + //!@} +}; + +struct BuildEnvironment +{ + BuildEnvironment() = delete; + BuildEnvironment(BuildEnvironment const& other) = delete; + BuildEnvironment(BuildEnvironment&& other) = delete; + BuildEnvironment(bool isSafe, bool versionCompatible, int32_t DLACore, std::string const& tempdir, + nvinfer1::TempfileControlFlags tempfileControls, std::string const& leanDLLPath = "") + : engine(isSafe, versionCompatible, DLACore, tempdir, tempfileControls, leanDLLPath) + { + } + + //! \name Owned TensorRT objects + //! Per TensorRT object lifetime requirements as outlined in the developer guide, + //! 
factory objects must remain live while the objects created by those factories + //! are live (with the exception of builder -> engine). + //! DO NOT ADJUST the declaration order here: builder -> network -> parser. + //! Destruction occurs in reverse declaration order: parser -> network -> builder. + //!@{ + + //! The builder used to build the engine. + std::unique_ptr builder; + + //! The network used by the builder. + std::unique_ptr network; + + //! The parser used to specify the network. + Parser parser; + + //! The engine. + LazilyDeserializedEngine engine; + //!@} +}; //! //! \brief Set up network and config @@ -89,95 +259,63 @@ void dumpRefittable(nvinfer1::ICudaEngine& engine); //! //! \return Pointer to the engine loaded or nullptr if the operation failed //! -nvinfer1::ICudaEngine* loadEngine(const std::string& engine, int DLACore, std::ostream& err); +nvinfer1::ICudaEngine* loadEngine(std::string const& engine, int32_t DLACore, std::ostream& err); //! //! \brief Save an engine into a file //! //! \return boolean Return true if the engine was successfully saved //! -bool saveEngine(const nvinfer1::ICudaEngine& engine, const std::string& fileName, std::ostream& err); +bool saveEngine(nvinfer1::ICudaEngine const& engine, std::string const& fileName, std::ostream& err); //! //! \brief Create an engine from model or serialized file, and optionally save engine //! //! \return Pointer to the engine created or nullptr if the creation failed //! -bool getEngineBuildEnv(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, - BuildEnvironment& env, std::ostream& err); - -//! -//! \brief Create an engine from model or serialized file, and optionally save engine -//! -//! \return Pointer to the engine created or nullptr if the creation failed -//! -inline TrtUniquePtr getEngine( - const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err) -{ - BuildEnvironment env; - TrtUniquePtr engine; - if (getEngineBuildEnv(model, build, sys, env, err)) - { - engine.swap(env.engine); - } - return engine; -} +bool getEngineBuildEnv( + ModelOptions const& model, BuildOptions const& build, SystemOptions& sys, BuildEnvironment& env, std::ostream& err); //! //! \brief Create a serialized network //! //! \return Pointer to a host memory for a serialized network //! -nvinfer1::IHostMemory* networkToSerialized(const BuildOptions& build, const SystemOptions& sys, nvinfer1::IBuilder& builder, - nvinfer1::INetworkDefinition& network, std::ostream& err); +nvinfer1::IHostMemory* networkToSerialized(const BuildOptions& build, const SystemOptions& sys, + nvinfer1::IBuilder& builder, nvinfer1::INetworkDefinition& network, std::ostream& err); //! //! \brief Tranfer model to a serialized network //! //! \return Pointer to a host memory for a serialized network //! -nvinfer1::IHostMemory* modelToSerialized(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err); +nvinfer1::IHostMemory* modelToSerialized( + const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err); //! //! \brief Serialize network and save it into a file //! //! \return boolean Return true if the network was successfully serialized and saved //! 
-bool serializeAndSave(const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err); +bool serializeAndSave( + const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err); bool timeRefit(const nvinfer1::INetworkDefinition& network, nvinfer1::ICudaEngine& engine, bool multiThreading); //! //! \brief Set tensor scales from a calibration table //! -void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, const std::vector& inputFormats, - const std::vector& outputFormats, const std::string& calibrationFile); +void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, std::vector const& inputFormats, + std::vector const& outputFormats, std::string const& calibrationFile); //! //! \brief Check if safe runtime is loaded. //! bool hasSafeRuntime(); -//! -//! \brief Create a safe runtime object if the dynamic library is loaded. -//! -nvinfer1::safe::IRuntime* createSafeInferRuntime(nvinfer1::ILogger& logger) noexcept; - -//! -//! \brief Check if consistency checker is loaded. -//! -bool hasConsistencyChecker(); +bool loadStreamingEngineToBuildEnv(std::string const& engine, BuildEnvironment& env, std::ostream& err); -//! -//! \brief Create a consistency checker object if the dynamic library is loaded. -//! -nvinfer1::consistency::IConsistencyChecker* createConsistencyChecker( - nvinfer1::ILogger& logger, nvinfer1::IHostMemory const* engine) noexcept; - -//! -//! \brief Run consistency check on serialized engine. -//! -bool checkSafeEngine(void const* serializedEngine, int32_t const engineSize); +bool loadEngineToBuildEnv(std::string const& engine, BuildEnvironment& env, std::ostream& err); } // namespace sample #endif // TRT_SAMPLE_ENGINES_H diff --git a/src/Detector/tensorrt_yolo/common/sampleEntrypoints.h b/src/Detector/tensorrt_yolo/common/sampleEntrypoints.h new file mode 100644 index 00000000..cc8bf1b9 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/sampleEntrypoints.h @@ -0,0 +1,101 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_SAMPLE_ENTRYPOINTS_H +#define TRT_SAMPLE_ENTRYPOINTS_H + +//! \file sampleEntrypoints.h +//! +//! Declares and conditionally defines entrypoints needed to create base TensorRT objects, depending +//! on whether the given sample uses TRT at link time or dynamically. Since common code is built once +//! and shared across all samples (both link-time and dynamic TRT), it does not define these entrypoints, +//! so each sample must define them individually. +//! +//! Samples that use TRT at link time can define DEFINE_TRT_ENTRYPOINTS before including this header to +//! pick up the definitions here. 
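// A minimal usage sketch for the opt-in described above, assuming a sample that links
// against TensorRT directly (the entrypoints should be defined in exactly one
// translation unit to avoid duplicate definitions):
//
//   #define DEFINE_TRT_ENTRYPOINTS 1
//   #include "sampleEntrypoints.h"
//
//   // Shared sample code then creates TensorRT objects only through these factories:
//   nvinfer1::IBuilder* builder = createBuilder();
//   nvinfer1::IRuntime* runtime = createRuntime();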
+ +#include "NvInfer.h" +#include "NvOnnxParser.h" +#include "logger.h" + +extern nvinfer1::IBuilder* createBuilder(); +extern nvinfer1::IRuntime* createRuntime(); +extern nvinfer1::IRefitter* createRefitter(nvinfer1::ICudaEngine& engine); + +extern nvonnxparser::IParser* createONNXParser(nvinfer1::INetworkDefinition& network); + +#if !defined(DEFINE_TRT_ENTRYPOINTS) +#define DEFINE_TRT_ENTRYPOINTS 0 +#endif + +// Allow opting out of individual entrypoints that are unused by the sample +#if !defined(DEFINE_TRT_BUILDER_ENTRYPOINT) +#define DEFINE_TRT_BUILDER_ENTRYPOINT 1 +#endif +#if !defined(DEFINE_TRT_RUNTIME_ENTRYPOINT) +#define DEFINE_TRT_RUNTIME_ENTRYPOINT 1 +#endif +#if !defined(DEFINE_TRT_REFITTER_ENTRYPOINT) +#define DEFINE_TRT_REFITTER_ENTRYPOINT 1 +#endif +#if !defined(DEFINE_TRT_ONNX_PARSER_ENTRYPOINT) +#define DEFINE_TRT_ONNX_PARSER_ENTRYPOINT 1 +#endif +#if !defined(DEFINE_TRT_LEGACY_PARSER_ENTRYPOINT) +#define DEFINE_TRT_LEGACY_PARSER_ENTRYPOINT 1 +#endif + +#if DEFINE_TRT_ENTRYPOINTS +nvinfer1::IBuilder* createBuilder() +{ +#if DEFINE_TRT_BUILDER_ENTRYPOINT + return nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger()); +#else + return {}; +#endif +} + +nvinfer1::IRuntime* createRuntime() +{ +#if DEFINE_TRT_RUNTIME_ENTRYPOINT + return nvinfer1::createInferRuntime(sample::gLogger.getTRTLogger()); +#else + return {}; +#endif +} + +nvinfer1::IRefitter* createRefitter(nvinfer1::ICudaEngine& engine) +{ +#if DEFINE_TRT_REFITTER_ENTRYPOINT + return nvinfer1::createInferRefitter(engine, sample::gLogger.getTRTLogger()); +#else + return {}; +#endif +} + +nvonnxparser::IParser* createONNXParser(nvinfer1::INetworkDefinition& network) +{ +#if DEFINE_TRT_ONNX_PARSER_ENTRYPOINT + return nvonnxparser::createParser(network, sample::gLogger.getTRTLogger()); +#else + return {}; +#endif +} + +#endif // DEFINE_TRT_ENTRYPOINTS + +#endif // TRT_SAMPLE_ENTRYPOINTS_H diff --git a/src/Detector/tensorrt_yolo/common/sampleInference.cpp b/src/Detector/tensorrt_yolo/common/sampleInference.cpp index 51f16882..ca0098d4 100644 --- a/src/Detector/tensorrt_yolo/common/sampleInference.cpp +++ b/src/Detector/tensorrt_yolo/common/sampleInference.cpp @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -23,6 +24,8 @@ #include #include #include +#include +#include #include #include #include @@ -35,6 +38,7 @@ #include "NvInfer.h" #include "ErrorRecorder.h" +#include "bfloat16.h" #include "logger.h" #include "sampleDevice.h" #include "sampleEngines.h" @@ -42,22 +46,23 @@ #include "sampleOptions.h" #include "sampleReporting.h" #include "sampleUtils.h" - +using namespace nvinfer1; namespace sample { -template -bool validateTensorNames( - const MapType& map, const EngineType* engine, const int32_t endBindingIndex) +template +bool validateTensorNames(TMapType const& map, TEngineType const* engine, int32_t const endBindingIndex) { // Check if the provided input tensor names match the input tensors of the engine. 
// Throw an error if the provided input tensor names cannot be found because it implies a potential typo. - for (const auto& item : map) + for (auto const& item : map) { bool tensorNameFound{false}; for (int32_t b = 0; b < endBindingIndex; ++b) { - if (engine->bindingIsInput(b) && engine->getBindingName(b) == item.first) + auto const tensorName = engine->getIOTensorName(b); + auto const tensorIOMode = engine->getTensorIOMode(tensorName); + if (tensorIOMode == nvinfer1::TensorIOMode::kINPUT && matchStringWithOneWildcard(item.first, tensorName)) { tensorNameFound = true; break; @@ -73,74 +78,86 @@ bool validateTensorNames( return true; } -template +template class FillBindingClosure { private: using InputsMap = std::unordered_map; using BindingsVector = std::vector>; - EngineType const* engine; - ContextType const* context; + TEngineType const* mEngine; + nvinfer1::IExecutionContext const* mContext; InputsMap const& inputs; BindingsVector& bindings; int32_t batch; int32_t endBindingIndex; + int32_t profileIndex; - void fillOneBinding(int32_t bindingIndex, int64_t vol) + void fillOneBinding(TensorInfo const& tensorInfo) { - auto const dims = getDims(bindingIndex); - auto const name = engine->getBindingName(bindingIndex); - auto const isInput = engine->bindingIsInput(bindingIndex); - auto const dataType = engine->getBindingDataType(bindingIndex); - auto const *bindingInOutStr = isInput ? "input" : "output"; + auto const name = tensorInfo.name; + auto const* bindingInOutStr = tensorInfo.isInput ? "Input" : "Output"; for (auto& binding : bindings) { - const auto input = inputs.find(name); - if (isInput && input != inputs.end()) + auto const input = findPlausible(inputs, name); + if (tensorInfo.isInput && input != inputs.end()) { sample::gLogInfo << "Using values loaded from " << input->second << " for input " << name << std::endl; - binding->addBinding(bindingIndex, name, isInput, vol, dataType, input->second); + binding->addBinding(tensorInfo, input->second); + } + else + { + if (tensorInfo.isInput) + { + sample::gLogInfo << "Using random values for input " << name << std::endl; + } + binding->addBinding(tensorInfo); + } + if (tensorInfo.isDynamic) + { + sample::gLogInfo << bindingInOutStr << " binding for " << name + << " is dynamic and will be created during execution using OutputAllocator." + << std::endl; } else { - sample::gLogInfo << "Using random values for " << bindingInOutStr << " " << name << std::endl; - binding->addBinding(bindingIndex, name, isInput, vol, dataType); + sample::gLogInfo << bindingInOutStr << " binding for " << name << " with dimensions " << tensorInfo.dims + << " is created." << std::endl; } - sample::gLogInfo << "Created " << bindingInOutStr <<" binding for " << name << " with dimensions " << dims << std::endl; } } bool fillAllBindings(int32_t batch, int32_t endBindingIndex) { - if (!validateTensorNames(inputs, engine, endBindingIndex)) + if (!validateTensorNames(inputs, mEngine, endBindingIndex)) { sample::gLogError << "Invalid tensor names found in --loadInputs flag." 
<< std::endl; return false; } - for (int32_t b = 0; b < endBindingIndex; b++) { - auto const dims = getDims(b); - auto const comps = engine->getBindingComponentsPerElement(b); - auto const strides = context->getStrides(b); - int32_t const vectorDimIndex = engine->getBindingVectorizedDim(b); - auto const vol = volume(dims, strides, vectorDimIndex, comps, batch); - fillOneBinding(b, vol); + TensorInfo tensorInfo; + tensorInfo.bindingIndex = b; + getTensorInfo(tensorInfo); + tensorInfo.updateVolume(batch); + fillOneBinding(tensorInfo); } return true; } - nvinfer1::Dims getDims(int32_t bindingIndex); + void getTensorInfo(TensorInfo& tensorInfo); public: - FillBindingClosure(EngineType const* _engine, ContextType const* _context, InputsMap const& _inputs, BindingsVector& _bindings, int32_t _batch, int32_t _endBindingIndex) - : engine(_engine) - , context(_context) + FillBindingClosure(TEngineType const* _engine, nvinfer1::IExecutionContext const* _context, + InputsMap const& _inputs, BindingsVector& _bindings, int32_t _batch, int32_t _endBindingIndex, + int32_t _profileIndex) + : mEngine(_engine) + , mContext(_context) , inputs(_inputs) , bindings(_bindings) , batch(_batch) , endBindingIndex(_endBindingIndex) + , profileIndex(_profileIndex) { } @@ -151,172 +168,364 @@ class FillBindingClosure }; template <> -nvinfer1::Dims FillBindingClosure::getDims(int32_t bindingIndex) +void FillBindingClosure::getTensorInfo(TensorInfo& tensorInfo) { - return context->getBindingDimensions(bindingIndex); + auto const b = tensorInfo.bindingIndex; + auto const name = mEngine->getIOTensorName(b); + tensorInfo.name = name; + tensorInfo.dims = mContext->getTensorShape(name); + tensorInfo.isDynamic = std::any_of( + tensorInfo.dims.d, tensorInfo.dims.d + tensorInfo.dims.nbDims, [](int32_t dim) { return dim == -1; }); + tensorInfo.comps = mEngine->getTensorComponentsPerElement(name, profileIndex); + tensorInfo.strides = mContext->getTensorStrides(name); + tensorInfo.vectorDimIndex = mEngine->getTensorVectorizedDim(name, profileIndex); + tensorInfo.isInput = mEngine->getTensorIOMode(name) == TensorIOMode::kINPUT; + tensorInfo.dataType = mEngine->getTensorDataType(name); } -template <> -nvinfer1::Dims FillBindingClosure::getDims(int32_t bindingIndex) +namespace { - return engine->getBindingDimensions(bindingIndex); +bool allocateContextMemory(InferenceEnvironment& iEnv, InferenceOptions const& inference) +{ + auto* engine = iEnv.engine.get(); + iEnv.deviceMemory.resize(inference.infStreams); + // Delay context memory allocation until input shapes are specified because runtime allocation would require actual + // input shapes. + for (int32_t i = 0; i < inference.infStreams; ++i) + { + auto const& ec = iEnv.contexts.at(i); + if (inference.memoryAllocationStrategy == MemoryAllocationStrategy::kSTATIC) + { + sample::gLogInfo << "Created execution context with device memory size: " + << (engine->getDeviceMemorySize() / 1.0_MiB) << " MiB" << std::endl; + } + else + { + size_t sizeToAlloc{0}; + const char* allocReason{nullptr}; + if (inference.memoryAllocationStrategy == MemoryAllocationStrategy::kPROFILE) + { + auto const p = inference.optProfileIndex; + sizeToAlloc = engine->getDeviceMemorySizeForProfile(p); + allocReason = "current profile"; + } + else if (inference.memoryAllocationStrategy == MemoryAllocationStrategy::kRUNTIME) + { + sizeToAlloc = ec->updateDeviceMemorySizeForShapes(); + allocReason = "current input shapes"; + } + else + { + sample::gLogError << "Unrecognizable memory allocation strategy." 
<< std::endl; + return false; + } + iEnv.deviceMemory.at(i) = TrtDeviceBuffer(sizeToAlloc); + ec->setDeviceMemoryV2(iEnv.deviceMemory.at(i).get(), iEnv.deviceMemory.at(i).getSize()); + sample::gLogInfo << "Maximum device memory size across all profiles: " + << (engine->getDeviceMemorySizeV2() / 1.0_MiB) << " MiB" << std::endl; + sample::gLogInfo << "Only allocated device memory enough for " << allocReason << ": " + << (sizeToAlloc / 1.0_MiB) << " MiB" << std::endl; + } + } + return true; } +} // namespace -bool setUpInference(InferenceEnvironment& iEnv, const InferenceOptions& inference) +bool setUpInference(InferenceEnvironment& iEnv, InferenceOptions const& inference, SystemOptions const& system) { +#if TRT_WINML + int32_t const isIntegrated{}; +#else int32_t device{}; cudaCheck(cudaGetDevice(&device)); cudaDeviceProp properties; cudaCheck(cudaGetDeviceProperties(&properties, device)); + int32_t const isIntegrated{properties.integrated}; +#endif // Use managed memory on integrated devices when transfers are skipped // and when it is explicitly requested on the commandline. - bool useManagedMemory{(inference.skipTransfers && properties.integrated) || inference.useManaged}; - using FillSafeBindings = FillBindingClosure; - if (iEnv.safe) + bool useManagedMemory{(inference.skipTransfers && isIntegrated) || inference.useManaged}; + SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError); + + using FillStdBindings = FillBindingClosure; + + auto* engine = iEnv.engine.get(); + SMP_RETVAL_IF_FALSE(engine != nullptr, "Got invalid engine!", false, sample::gLogError); + + // Release serialized blob to save memory space. + iEnv.engine.releaseBlob(); + + // Setup weight streaming if enabled + if (engine->getStreamableWeightsSize() > 0) { - ASSERT(sample::hasSafeRuntime()); - auto* safeEngine = iEnv.safeEngine.get(); - for (int32_t s = 0; s < inference.streams; ++s) + auto const& budget = inference.weightStreamingBudget; + int64_t wsBudget = budget.bytes; + if (budget.percent != 100.0) + { + double const percent = budget.percent; + ASSERT(percent < 100.0); + auto const max = engine->getStreamableWeightsSize(); + wsBudget = (max >= 0) ? (percent / 100) * (max) : WeightStreamingBudget::kDISABLE; + } + + if (wsBudget == WeightStreamingBudget::kDISABLE) { - iEnv.safeContext.emplace_back(safeEngine->createExecutionContext()); - iEnv.bindings.emplace_back(new Bindings(useManagedMemory)); + wsBudget = engine->getStreamableWeightsSize(); } - const int32_t nBindings = safeEngine->getNbBindings(); - auto const* safeContext = iEnv.safeContext.front().get(); - // batch is set to 1 because safety only support explicit batch. - return FillSafeBindings(iEnv.safeEngine.get(), safeContext, inference.inputs, iEnv.bindings, 1, nBindings)(); + else if (wsBudget == WeightStreamingBudget::kAUTOMATIC) + { + wsBudget = engine->getWeightStreamingAutomaticBudget(); + } + ASSERT(wsBudget >= 0); + bool success = engine->setWeightStreamingBudgetV2(wsBudget); + SMP_RETVAL_IF_FALSE(success, "Failed to set weight streaming limit!", false, sample::gLogError); + switch (wsBudget) + { + case WeightStreamingBudget::kDISABLE: + { + sample::gLogInfo << "Weight streaming has been disabled at runtime." << std::endl; + break; + } + + case WeightStreamingBudget::kAUTOMATIC: + { + sample::gLogInfo << "The weight streaming budget will automatically be chosen by TensorRT." 
<< std::endl; + break; + } + default: + { + sample::gLogInfo << "Weight streaming is enabled with a device memory limit of " << wsBudget << " bytes." + << std::endl; + break; + } + } + } + + int32_t const nbOptProfiles = engine->getNbOptimizationProfiles(); + + if (inference.optProfileIndex >= nbOptProfiles) + { + sample::gLogError << "Selected profile index " << inference.optProfileIndex + << " exceeds the number of profiles that the engine holds. " << std::endl; + return false; } - using FillStdBindings = FillBindingClosure; + if (nbOptProfiles > 1 && !inference.setOptProfile) + { + sample::gLogWarning << nbOptProfiles + << " profiles detected but not set. Running with profile 0. Please use " + "--dumpOptimizationProfile to see all available profiles." + << std::endl; + } + + cudaStream_t setOptProfileStream; + CHECK(cudaStreamCreate(&setOptProfileStream)); - for (int32_t s = 0; s < inference.streams; ++s) + for (int32_t s = 0; s < inference.infStreams; ++s) { - auto ec = iEnv.engine->createExecutionContext(); + IExecutionContext* ec{nullptr}; + if (inference.memoryAllocationStrategy == MemoryAllocationStrategy::kSTATIC) + { + // Let TRT pre-allocate and manage the memory. + ec = engine->createExecutionContext(); + } + else + { + // Allocate based on the current profile or runtime shapes. + ec = engine->createExecutionContext(ExecutionContextAllocationStrategy::kUSER_MANAGED); + } if (ec == nullptr) { sample::gLogError << "Unable to create execution context for stream " << s << "." << std::endl; return false; } - iEnv.context.emplace_back(ec); + ec->setNvtxVerbosity(inference.nvtxVerbosity); + +#if !TRT_WINML + int32_t const persistentCacheLimit + = samplesCommon::getMaxPersistentCacheSize() * inference.persistentCacheRatio; + sample::gLogInfo << "Setting persistentCacheLimit to " << persistentCacheLimit << " bytes." << std::endl; + ec->setPersistentCacheLimit(persistentCacheLimit); +#endif + + auto setProfile = ec->setOptimizationProfileAsync(inference.optProfileIndex, setOptProfileStream); + CHECK(cudaStreamSynchronize(setOptProfileStream)); + + if (!setProfile) + { + sample::gLogError << "Set optimization profile failed. " << std::endl; + if (inference.infStreams > 1) + { + sample::gLogError + << "Please ensure that the engine is built with preview feature profileSharing0806 enabled. " + << std::endl; + } + return false; + } + + iEnv.contexts.emplace_back(ec); iEnv.bindings.emplace_back(new Bindings(useManagedMemory)); } + + CHECK(cudaStreamDestroy(setOptProfileStream)); + if (iEnv.profiler) { - iEnv.context.front()->setProfiler(iEnv.profiler.get()); + iEnv.contexts.front()->setProfiler(iEnv.profiler.get()); // Always run reportToProfiler() after enqueue launch - iEnv.context.front()->setEnqueueEmitsProfile(false); + iEnv.contexts.front()->setEnqueueEmitsProfile(false); } - const int32_t nOptProfiles = iEnv.engine->getNbOptimizationProfiles(); - const int32_t nBindings = iEnv.engine->getNbBindings(); - const int32_t bindingsInProfile = nOptProfiles > 0 ? nBindings / nOptProfiles : 0; - const int32_t endBindingIndex = bindingsInProfile ? bindingsInProfile : iEnv.engine->getNbBindings(); - - if (nOptProfiles > 1) - { - sample::gLogWarning << "Multiple profiles are currently not supported. Running with one profile." << std::endl; - } + int32_t const endBindingIndex = engine->getNbIOTensors(); // Make sure that the tensor names provided in command-line args actually exist in any of the engine bindings // to avoid silent typos. 
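// (The keys of inference.shapes come from the command line; they are matched against engine
//  tensor names with matchStringWithOneWildcard()/findPlausible(), so a provided name may
//  contain a single wildcard. Example, assuming trtexec-style syntax and a hypothetical
//  input tensor named "images": --shapes=images:1x3x640x640.)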
- if (!validateTensorNames(inference.shapes, iEnv.engine.get(), endBindingIndex)) + if (!validateTensorNames(inference.shapes, engine, endBindingIndex)) { sample::gLogError << "Invalid tensor names found in --shapes flag." << std::endl; return false; } - // Set all input dimensions before all bindings can be allocated for (int32_t b = 0; b < endBindingIndex; ++b) { - if (iEnv.engine->bindingIsInput(b)) + auto const& name = engine->getIOTensorName(b); + auto const& mode = engine->getTensorIOMode(name); + if (mode == TensorIOMode::kINPUT) { - auto dims = iEnv.context.front()->getBindingDimensions(b); - const bool isScalar = dims.nbDims == 0; - const bool isDynamicInput = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }) - || iEnv.engine->isShapeBinding(b); - if (isDynamicInput) + Dims const dims = iEnv.contexts.front()->getTensorShape(name); + bool isShapeInferenceIO{false}; + isShapeInferenceIO = engine->isShapeInferenceIO(name); + bool const hasRuntimeDim = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }); + auto const shape = findPlausible(inference.shapes, name); + if (hasRuntimeDim || isShapeInferenceIO) { - auto shape = inference.shapes.find(iEnv.engine->getBindingName(b)); + // Set shapeData to either dimensions of the input (if it has a dynamic shape) + // or set to values of the input (if it is an input shape tensor). + std::vector shapeData; - std::vector staticDims; if (shape == inference.shapes.end()) { - // If no shape is provided, set dynamic dimensions to 1. - constexpr int32_t DEFAULT_DIMENSION = 1; - if (iEnv.engine->isShapeBinding(b)) + // No information provided. Use default value for missing data. + constexpr int32_t kDEFAULT_VALUE = 1; + if (isShapeInferenceIO) { - if (isScalar) - { - staticDims.push_back(1); - } - else - { - staticDims.resize(dims.d[0]); - std::fill(staticDims.begin(), staticDims.end(), DEFAULT_DIMENSION); - } + // Set shape tensor to all ones. + shapeData.assign(volume(dims, 0, dims.nbDims), kDEFAULT_VALUE); + sample::gLogWarning << "Values missing for input shape tensor: " << name + << "Automatically setting values to: " << shapeData << std::endl; } else { - staticDims.resize(dims.nbDims); - std::transform(dims.d, dims.d + dims.nbDims, staticDims.begin(), - [&](int32_t dimension) { return dimension >= 0 ? dimension : DEFAULT_DIMENSION; }); + // Use default value for unspecified runtime dimensions. + shapeData.resize(dims.nbDims); + std::transform(dims.d, dims.d + dims.nbDims, shapeData.begin(), + [&](int32_t dimension) { return dimension >= 0 ? dimension : kDEFAULT_VALUE; }); + sample::gLogWarning << "Shape missing for input with dynamic shape: " << name + << "Automatically setting shape to: " << shapeData << std::endl; } - sample::gLogWarning << "Dynamic dimensions required for input: " << iEnv.engine->getBindingName(b) - << ", but no shapes were provided. Automatically overriding shape to: " - << staticDims << std::endl; } - else if (inference.inputs.count(shape->first) && iEnv.engine->isShapeBinding(b)) + else if (inference.inputs.count(shape->first) && isShapeInferenceIO) { - if (isScalar || dims.nbDims == 1) - { - // Load shape tensor from file. - size_t const size = isScalar ? 
1 : dims.d[0]; - staticDims.resize(size); - auto const& filename = inference.inputs.at(shape->first); - auto dst = reinterpret_cast(staticDims.data()); - loadFromFile(filename, dst, size * sizeof(decltype(staticDims)::value_type)); - } - else - { - sample::gLogWarning << "Cannot load shape tensor " << shape->first << " from file, " - << "ND-Shape isn't supported yet" << std::endl; - // Fallback - staticDims = shape->second; - } + // Load shape tensor from file. + int64_t const size = volume(dims, 0, dims.nbDims); + shapeData.resize(size); + auto const& filename = inference.inputs.at(shape->first); + auto dst = reinterpret_cast(shapeData.data()); + loadFromFile(filename, dst, size * sizeof(decltype(shapeData)::value_type)); } else { - staticDims = shape->second; + shapeData = shape->second; + } + + int32_t* shapeTensorData{nullptr}; + if (isShapeInferenceIO) + { + // Save the data in iEnv, in a way that it's address does not change + // before enqueueV3 is called. + iEnv.inputShapeTensorValues.emplace_back(shapeData); + shapeTensorData = iEnv.inputShapeTensorValues.back().data(); } - for (auto& c : iEnv.context) + for (auto& c : iEnv.contexts) { - if (iEnv.engine->isShapeBinding(b)) + if (isShapeInferenceIO) { - if (!c->setInputShapeBinding(b, staticDims.data())) + sample::gLogInfo << "Set input shape tensor " << name << " to: " << shapeData << std::endl; + if (!c->setTensorAddress(name, shapeTensorData)) { return false; } } else { - if (!c->setBindingDimensions(b, toDims(staticDims))) + sample::gLogInfo << "Set shape of input tensor " << name << " to: " << shapeData + << std::endl; + if (!c->setInputShape(name, toDims(shapeData))) { return false; } } } } + else if (nbOptProfiles && shape != inference.shapes.end()) + { + // Check if the provided shape matches the static dimensions in the engine. + for (auto& c : iEnv.contexts) + { + if (!c->setInputShape(name, toDims(shape->second))) + { + sample::gLogError << "The engine was built with static shapes for input tensor " << name + << " but the provided shapes do not match the static shapes!" << std::endl; + return false; + } + } + } } } - auto* engine = iEnv.engine.get(); - auto const* context = iEnv.context.front().get(); - int32_t const batch = engine->hasImplicitBatchDimension() ? inference.batch : 1; - return FillStdBindings(engine, context, inference.inputs, iEnv.bindings, batch, endBindingIndex)(); + // Create Debug Listener and turn on debug states if client requested dumping debug tensors. 
+ if (!inference.debugTensorFileNames.empty()) + { + iEnv.listener.reset(new DebugTensorWriter(inference.debugTensorFileNames)); + iEnv.contexts.front()->setDebugListener(iEnv.listener.get()); + for (auto const& s : inference.debugTensorFileNames) + { + iEnv.contexts.front()->setTensorDebugState(s.first.c_str(), true); + } + } + + if (!allocateContextMemory(iEnv, inference)) + { + return false; + } + + auto const* context = iEnv.contexts.front().get(); + return FillStdBindings( + engine, context, inference.inputs, iEnv.bindings, 1, endBindingIndex, inference.optProfileIndex)(); } +TaskInferenceEnvironment::TaskInferenceEnvironment( + std::string engineFile, InferenceOptions inference, int32_t deviceId, int32_t DLACore, int32_t bs) + : iOptions(inference) + , device(deviceId) + , batch(bs) +{ + BuildEnvironment bEnv(/* isSafe */ false, /* versionCompatible */ false, DLACore, "", getTempfileControlDefaults()); + loadEngineToBuildEnv(engineFile, bEnv, sample::gLogError); + std::unique_ptr tmp(new InferenceEnvironment(bEnv)); + iEnv = std::move(tmp); + + cudaCheck(cudaSetDevice(device)); + SystemOptions system{}; + system.device = device; + system.DLACore = DLACore; + if (!setUpInference(*iEnv, iOptions, system)) + { + sample::gLogError << "Inference set up failed" << std::endl; + } +} namespace { @@ -353,74 +562,60 @@ struct SyncStruct struct Enqueue { - explicit Enqueue(nvinfer1::IExecutionContext& context, void** buffers) + explicit Enqueue(nvinfer1::IExecutionContext& context) : mContext(context) - , mBuffers(buffers) { } nvinfer1::IExecutionContext& mContext; - void** mBuffers{}; }; //! -//! \class EnqueueImplicit -//! \brief Functor to enqueue inference with implict batch +//! \class EnqueueExplicit +//! \brief Functor to enqueue inference with explict batch //! -class EnqueueImplicit : private Enqueue +class EnqueueExplicit : private Enqueue { public: - explicit EnqueueImplicit(nvinfer1::IExecutionContext& context, void** buffers, int32_t batch) - : Enqueue(context, buffers) - , mBatch(batch) + explicit EnqueueExplicit(nvinfer1::IExecutionContext& context, Bindings const& bindings) + : Enqueue(context) + , mBindings(bindings) { + ASSERT(mBindings.setTensorAddresses(mContext)); } bool operator()(TrtCudaStream& stream) const { - if (mContext.enqueue(mBatch, mBuffers, stream.get(), nullptr)) + try { - // Collecting layer timing info from current profile index of execution context - if (mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() && !mContext.reportToProfiler()) + bool const result = mContext.enqueueV3(stream.get()); + // Collecting layer timing info from current profile index of execution context, except under capturing + // mode. + if (!isStreamCapturing(stream) && mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() + && !mContext.reportToProfiler()) { - gLogWarning << "Failed to collect layer timing info from previous enqueue()" << std::endl; + gLogWarning << "Failed to collect layer timing info from previous enqueueV3()" << std::endl; } - return true; + return result; + } + catch (const std::exception&) + { + return false; } return false; } private: - int32_t mBatch; -}; - -//! -//! \class EnqueueExplicit -//! \brief Functor to enqueue inference with explict batch -//! -class EnqueueExplicit : private Enqueue -{ - -public: - explicit EnqueueExplicit(nvinfer1::IExecutionContext& context, void** buffers) - : Enqueue(context, buffers) + // Helper function to check if a stream is in capturing mode. 
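    // Per-layer profiling is skipped while the stream is capturing: collecting timing via
    // reportToProfiler() generally requires synchronizing with the enqueued work, which is
    // not permitted while a CUDA graph is being captured.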
+ bool isStreamCapturing(TrtCudaStream& stream) const { + cudaStreamCaptureStatus status{cudaStreamCaptureStatusNone}; + cudaCheck(cudaStreamIsCapturing(stream.get(), &status)); + return status != cudaStreamCaptureStatusNone; } - bool operator()(TrtCudaStream& stream) const - { - if (mContext.enqueueV2(mBuffers, stream.get(), nullptr)) - { - // Collecting layer timing info from current profile index of execution context - if (mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() && !mContext.reportToProfiler()) - { - gLogWarning << "Failed to collect layer timing info from previous enqueueV2()" << std::endl; - } - return true; - } - return false; - } + Bindings const& mBindings; }; //! @@ -442,7 +637,7 @@ class EnqueueGraph if (mGraph.launch(stream)) { // Collecting layer timing info from current profile index of execution context - if (mContext.getProfiler() && !mContext.reportToProfiler()) + if (mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() && !mContext.reportToProfiler()) { gLogWarning << "Failed to collect layer timing info from previous CUDA graph launch" << std::endl; } @@ -456,29 +651,24 @@ class EnqueueGraph }; //! -//! \class EnqueueSafe -//! \brief Functor to enqueue safe execution context +//! \class EnqueueGraphSafe +//! \brief Functor to enqueue inference from CUDA Graph //! -class EnqueueSafe +class EnqueueGraphSafe { + public: - explicit EnqueueSafe(nvinfer1::safe::IExecutionContext& context, void** buffers) - : mContext(context) - , mBuffers(buffers) + explicit EnqueueGraphSafe(TrtCudaGraph& graph) + : mGraph(graph) { } bool operator()(TrtCudaStream& stream) const { - if (mContext.enqueueV2(mBuffers, stream.get(), nullptr)) - { - return true; - } - return false; + return mGraph.launch(stream); } - nvinfer1::safe::IExecutionContext& mContext; - void** mBuffers{}; + TrtCudaGraph& mGraph; }; using EnqueueFunction = std::function; @@ -512,12 +702,11 @@ using EnqueueTimes = std::array; //! \class Iteration //! \brief Inference iteration and streams management //! 
-template class Iteration { public: - Iteration(int32_t id, const InferenceOptions& inference, ContextType& context, Bindings& bindings) + Iteration(int32_t id, InferenceOptions const& inference, nvinfer1::IExecutionContext& context, Bindings& bindings) : mBindings(bindings) , mStreamId(id) , mDepth(1 + inference.overlap) @@ -546,7 +735,7 @@ class Iteration if (!skipTransfers) { record(EventType::kINPUT_S, StreamType::kINPUT); - mBindings.transferInputToDevice(getStream(StreamType::kINPUT)); + setInputData(false); record(EventType::kINPUT_E, StreamType::kINPUT); wait(EventType::kINPUT_E, StreamType::kCOMPUTE); // Wait for input DMA before compute } @@ -564,7 +753,7 @@ class Iteration { wait(EventType::kCOMPUTE_E, StreamType::kOUTPUT); // Wait for compute before output DMA record(EventType::kOUTPUT_S, StreamType::kOUTPUT); - mBindings.transferOutputToHost(getStream(StreamType::kOUTPUT)); + fetchOutputData(false); record(EventType::kOUTPUT_E, StreamType::kOUTPUT); } @@ -574,7 +763,7 @@ class Iteration } float sync( - const TimePoint& cpuStart, const TrtCudaEvent& gpuStart, std::vector& trace, bool skipTransfers) + TimePoint const& cpuStart, TrtCudaEvent const& gpuStart, std::vector& trace, bool skipTransfers) { if (mActive[mNext]) { @@ -594,7 +783,7 @@ class Iteration } void syncAll( - const TimePoint& cpuStart, const TrtCudaEvent& gpuStart, std::vector& trace, bool skipTransfers) + TimePoint const& cpuStart, TrtCudaEvent const& gpuStart, std::vector& trace, bool skipTransfers) { for (int32_t d = 0; d < mDepth; ++d) { @@ -608,14 +797,24 @@ class Iteration getStream(StreamType::kINPUT).wait(gpuStart); } - void setInputData() + void setInputData(bool sync) { mBindings.transferInputToDevice(getStream(StreamType::kINPUT)); + // additional sync to avoid overlapping with inference execution. + if (sync) + { + getStream(StreamType::kINPUT).synchronize(); + } } - void fetchOutputData() + void fetchOutputData(bool sync) { mBindings.transferOutputToHost(getStream(StreamType::kOUTPUT)); + // additional sync to avoid overlapping with inference execution. + if (sync) + { + getStream(StreamType::kOUTPUT).synchronize(); + } } private: @@ -655,12 +854,16 @@ class Iteration getStream(s).wait(getEvent(e)); } - InferenceTrace getTrace(const TimePoint& cpuStart, const TrtCudaEvent& gpuStart, bool skipTransfers) + InferenceTrace getTrace(TimePoint const& cpuStart, TrtCudaEvent const& gpuStart, bool skipTransfers) { - float is = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart : getEvent(EventType::kINPUT_S) - gpuStart; - float ie = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart : getEvent(EventType::kINPUT_E) - gpuStart; - float os = skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart : getEvent(EventType::kOUTPUT_S) - gpuStart; - float oe = skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart : getEvent(EventType::kOUTPUT_E) - gpuStart; + float is + = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart : getEvent(EventType::kINPUT_S) - gpuStart; + float ie + = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart : getEvent(EventType::kINPUT_E) - gpuStart; + float os + = skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart : getEvent(EventType::kOUTPUT_S) - gpuStart; + float oe + = skipTransfers ? 
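The Iteration class above brackets input DMA, compute, and output DMA with CUDA events on separate streams so that per-phase timings can be read back for the trace. Stripped of the TensorRT specifics, the event pattern looks like this (a sketch; the measured work is left as a comment):

```cpp
// Sketch: time one phase on a CUDA stream with events, as the Iteration bookkeeping does.
#include <cuda_runtime_api.h>

float timePhase(cudaStream_t stream)
{
    cudaEvent_t start{}, stop{};
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, stream);
    // ... enqueue the work to measure here (H2D copy, enqueueV3, or D2H copy) ...
    cudaEventRecord(stop, stream);

    cudaEventSynchronize(stop); // Wait until the phase has finished on the GPU.
    float ms = 0.F;
    cudaEventElapsedTime(&ms, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}
```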
getEvent(EventType::kCOMPUTE_E) - gpuStart : getEvent(EventType::kOUTPUT_E) - gpuStart; return InferenceTrace(mStreamId, std::chrono::duration(getEnqueueTime(true) - cpuStart).count(), @@ -668,19 +871,22 @@ class Iteration getEvent(EventType::kCOMPUTE_S) - gpuStart, getEvent(EventType::kCOMPUTE_E) - gpuStart, os, oe); } - void createEnqueueFunction(const InferenceOptions& inference, nvinfer1::IExecutionContext& context, Bindings& /*bindings*/) + void createEnqueueFunction( + InferenceOptions const& inference, nvinfer1::IExecutionContext& context, Bindings& bindings) { - if (inference.batch) - mEnqueue = EnqueueFunction(EnqueueImplicit(context, mBindings.getDeviceBuffers(), inference.batch)); - else - mEnqueue = EnqueueFunction(EnqueueExplicit(context, mBindings.getDeviceBuffers())); - + mEnqueue = EnqueueFunction(EnqueueExplicit(context, mBindings)); if (inference.graph) { + sample::gLogInfo << "Capturing CUDA graph for the current execution context" << std::endl; + TrtCudaStream& stream = getStream(StreamType::kCOMPUTE); - // Avoid capturing initialization calls by executing the enqueue function at least once before starting CUDA graph capture. - const auto ret = mEnqueue(stream); - assert(ret); + // Avoid capturing initialization calls by executing the enqueue function at least + // once before starting CUDA graph capture. + auto const ret = mEnqueue(stream); + if (!ret) + { + throw std::runtime_error("Inference enqueue failed."); + } stream.synchronize(); mGraph.beginCapture(stream); @@ -690,6 +896,7 @@ class Iteration { mGraph.endCapture(stream); mEnqueue = EnqueueFunction(EnqueueGraph(context, mGraph)); + sample::gLogInfo << "Successfully captured CUDA graph for the current execution context" << std::endl; } else { @@ -706,11 +913,6 @@ class Iteration } } - void createEnqueueFunction(const InferenceOptions&, nvinfer1::safe::IExecutionContext& context, Bindings&) - { - mEnqueue = EnqueueFunction(EnqueueSafe(context, mBindings.getDeviceBuffers())); - } - Bindings& mBindings; TrtCudaGraph mGraph; @@ -726,23 +928,44 @@ class Iteration int32_t enqueueStart{0}; std::vector mEnqueueTimes; - ContextType* mContext{nullptr}; + nvinfer1::IExecutionContext* mContext{nullptr}; }; -template -bool inferenceLoop(std::vector>>& iStreams, const TimePoint& cpuStart, - const TrtCudaEvent& gpuStart, int iterations, float maxDurationMs, float warmupMs, +bool inferenceLoop(std::vector>& iStreams, TimePoint const& cpuStart, + TrtCudaEvent const& gpuStart, int iterations, float maxDurationMs, float warmupMs, std::vector& trace, bool skipTransfers, float idleMs) { float durationMs = 0; int32_t skip = 0; + if (maxDurationMs == -1.F) + { + sample::gLogWarning << "--duration=-1 is specified, inference will run in an endless loop until" + << " aborted with CTRL-C (SIGINT)" << std::endl; + while (true) + { + for (auto& s : iStreams) + { + if (!s->query(skipTransfers)) + { + return false; + } + } + for (auto& s : iStreams) + { + s->sync(cpuStart, gpuStart, trace, skipTransfers); + } + } + } + for (int32_t i = 0; i < iterations + skip || durationMs < maxDurationMs; ++i) { for (auto& s : iStreams) { if (!s->query(skipTransfers)) + { return false; + } } for (auto& s : iStreams) { @@ -751,12 +974,15 @@ bool inferenceLoop(std::vector>>& iStream if (durationMs < warmupMs) // Warming up { if (durationMs) // Skip complete iterations + { ++skip; - + } continue; } if (idleMs != 0.F) + { std::this_thread::sleep_for(std::chrono::duration(idleMs)); + } } for (auto& s : iStreams) { @@ -765,74 +991,81 @@ bool 
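When CUDA graphs are requested, createEnqueueFunction above runs one warm-up enqueue, captures the next enqueueV3 call into a graph, and swaps the enqueue functor for a graph launch. Roughly the same sequence with the raw CUDA graph API, as a sketch with most error handling omitted; `context` and `stream` are assumed to be fully set up:

```cpp
// Sketch: capture one enqueueV3 call into a CUDA graph and replay it.
#include <NvInfer.h>
#include <cuda_runtime_api.h>

bool captureAndLaunch(nvinfer1::IExecutionContext& context, cudaStream_t stream)
{
    // Warm-up outside capture so lazy initialization is not recorded into the graph.
    if (!context.enqueueV3(stream))
    {
        return false;
    }
    cudaStreamSynchronize(stream);

    cudaGraph_t graph{};
    cudaGraphExec_t graphExec{};
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
    bool const enqueued = context.enqueueV3(stream); // Recorded into the graph, not executed.
    cudaStreamEndCapture(stream, &graph);
    if (!enqueued)
    {
        return false;
    }
    // cudaGraphInstantiateWithFlags is available from CUDA 11.4 onwards.
    if (cudaGraphInstantiateWithFlags(&graphExec, graph, 0) != cudaSuccess)
    {
        return false;
    }
    cudaGraphDestroy(graph);

    // Each replay re-runs the captured enqueue with minimal launch overhead.
    return cudaGraphLaunch(graphExec, stream) == cudaSuccess;
}
```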
inferenceLoop(std::vector>>& iStream return true; } -template -void inferenceExecution(const InferenceOptions& inference, InferenceEnvironment& iEnv, SyncStruct& sync, - const int32_t threadIdx, const int32_t streamsPerThread, int32_t device, std::vector& trace) +void inferenceExecution(InferenceOptions const& inference, InferenceEnvironment& iEnv, SyncStruct& sync, + int32_t const threadIdx, int32_t const streamsPerThread, int32_t device, + std::vector& trace) noexcept { - float warmupMs = inference.warmup; - float durationMs = inference.duration * 1000.F + warmupMs; - - cudaCheck(cudaSetDevice(device)); - - std::vector>> iStreams; - - for (int32_t s = 0; s < streamsPerThread; ++s) + try { - const int32_t streamId{threadIdx * streamsPerThread + s}; - auto* iteration = new Iteration( - streamId, inference, *iEnv.template getContext(streamId), *iEnv.bindings[streamId]); - if (inference.skipTransfers) + float warmupMs = inference.warmup; + float durationMs = -1.F; + if (inference.duration != -1.F) { - iteration->setInputData(); + durationMs = inference.duration * 1000.F + warmupMs; } - iStreams.emplace_back(iteration); - } - for (auto& s : iStreams) - { - s->wait(sync.gpuStart); - } + cudaCheck(cudaSetDevice(device)); - std::vector localTrace; - if (!inferenceLoop(iStreams, sync.cpuStart, sync.gpuStart, inference.iterations, durationMs, warmupMs, localTrace, - inference.skipTransfers, inference.idle)) - { - iEnv.error = true; - } + std::vector> iStreams; + + for (int32_t s = 0; s < streamsPerThread; ++s) + { + int32_t const streamId{threadIdx * streamsPerThread + s}; + auto* iteration = new Iteration(streamId, inference, *iEnv.getContext(streamId), *iEnv.bindings[streamId]); + if (inference.skipTransfers) + { + iteration->setInputData(true); + } + iStreams.emplace_back(iteration); + } - if (inference.skipTransfers) - { for (auto& s : iStreams) { - s->fetchOutputData(); + s->wait(sync.gpuStart); } - } - sync.mutex.lock(); - trace.insert(trace.end(), localTrace.begin(), localTrace.end()); - sync.mutex.unlock(); -} + std::vector localTrace; + if (!inferenceLoop(iStreams, sync.cpuStart, sync.gpuStart, inference.iterations, durationMs, warmupMs, + localTrace, inference.skipTransfers, inference.idle)) + { + sync.mutex.lock(); + iEnv.error = true; + sync.mutex.unlock(); + } -inline std::thread makeThread(const InferenceOptions& inference, InferenceEnvironment& iEnv, SyncStruct& sync, - int32_t threadIdx, int32_t streamsPerThread, int32_t device, std::vector& trace) -{ + if (inference.skipTransfers) + { + for (auto& s : iStreams) + { + s->fetchOutputData(true); + } + } - if (iEnv.safe) + sync.mutex.lock(); + trace.insert(trace.end(), localTrace.begin(), localTrace.end()); + sync.mutex.unlock(); + } + catch (...) 
{ - ASSERT(sample::hasSafeRuntime()); - return std::thread(inferenceExecution, std::cref(inference), std::ref(iEnv), - std::ref(sync), threadIdx, streamsPerThread, device, std::ref(trace)); + sync.mutex.lock(); + iEnv.error = true; + sync.mutex.unlock(); } +} - return std::thread(inferenceExecution, std::cref(inference), std::ref(iEnv), - std::ref(sync), threadIdx, streamsPerThread, device, std::ref(trace)); +inline std::thread makeThread(InferenceOptions const& inference, InferenceEnvironment& iEnv, SyncStruct& sync, + int32_t threadIdx, int32_t streamsPerThread, int32_t device, std::vector& trace) +{ + return std::thread(inferenceExecution, std::cref(inference), std::ref(iEnv), std::ref(sync), threadIdx, + streamsPerThread, device, std::ref(trace)); } } // namespace bool runInference( - const InferenceOptions& inference, InferenceEnvironment& iEnv, int32_t device, std::vector& trace) + InferenceOptions const& inference, InferenceEnvironment& iEnv, int32_t device, std::vector& trace) { + SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError); cudaCheck(cudaProfilerStart()); trace.resize(0); @@ -846,8 +1079,8 @@ bool runInference( // When multiple streams are used, trtexec can run inference in two modes: // (1) if inference.threads is true, then run each stream on each thread. // (2) if inference.threads is false, then run all streams on the same thread. - const int32_t numThreads = inference.threads ? inference.streams : 1; - const int32_t streamsPerThread = inference.threads ? 1 : inference.streams; + int32_t const numThreads = inference.threads ? inference.infStreams : 1; + int32_t const streamsPerThread = inference.threads ? 1 : inference.infStreams; std::vector threads; for (int32_t threadIdx = 0; threadIdx < numThreads; ++threadIdx) @@ -861,12 +1094,47 @@ bool runInference( cudaCheck(cudaProfilerStop()); - auto cmpTrace = [](const InferenceTrace& a, const InferenceTrace& b) { return a.h2dStart < b.h2dStart; }; + auto cmpTrace = [](InferenceTrace const& a, InferenceTrace const& b) { return a.h2dStart < b.h2dStart; }; std::sort(trace.begin(), trace.end(), cmpTrace); return !iEnv.error; } +bool runMultiTasksInference(std::vector>& tEnvList) +{ + cudaCheck(cudaProfilerStart()); + cudaSetDeviceFlags(cudaDeviceScheduleSpin); + + SyncStruct sync; + sync.sleep = 0; + sync.mainStream.sleep(&sync.sleep); + sync.cpuStart = getCurrentTime(); + sync.gpuStart.record(sync.mainStream); + + std::vector threads; + for (size_t i = 0; i < tEnvList.size(); ++i) + { + auto& tEnv = tEnvList[i]; + threads.emplace_back(makeThread( + tEnv->iOptions, *(tEnv->iEnv), sync, /*threadIdx*/ 0, /*streamsPerThread*/ 1, tEnv->device, tEnv->trace)); + } + for (auto& th : threads) + { + th.join(); + } + + cudaCheck(cudaProfilerStop()); + + auto cmpTrace = [](InferenceTrace const& a, InferenceTrace const& b) { return a.h2dStart < b.h2dStart; }; + for (auto& tEnv : tEnvList) + { + std::sort(tEnv->trace.begin(), tEnv->trace.end(), cmpTrace); + } + + return std::none_of(tEnvList.begin(), tEnvList.end(), + [](std::unique_ptr& tEnv) { return tEnv->iEnv->error; }); +} + namespace { size_t reportGpuMemory() @@ -889,36 +1157,31 @@ size_t reportGpuMemory() } // namespace //! Returns true if deserialization is slower than expected or fails. 
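runInference above either dedicates one thread per stream or drives all streams from a single thread, depending on the threads option. The fan-out itself is plain std::thread plumbing; a sketch of that launching pattern, where worker() is only a stand-in for inferenceExecution:

```cpp
// Sketch: one worker thread per stream vs. all streams on one thread, as in runInference.
#include <cstdint>
#include <thread>
#include <vector>

// Stand-in for inferenceExecution(); the real worker drives one or more inference streams.
void worker(int32_t /*threadIdx*/, int32_t /*streamsPerThread*/) {}

void launchWorkers(bool oneThreadPerStream, int32_t numStreams)
{
    int32_t const numThreads = oneThreadPerStream ? numStreams : 1;
    int32_t const streamsPerThread = oneThreadPerStream ? 1 : numStreams;

    std::vector<std::thread> threads;
    for (int32_t t = 0; t < numThreads; ++t)
    {
        threads.emplace_back(worker, t, streamsPerThread);
    }
    for (auto& th : threads)
    {
        th.join();
    }
}
```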
-bool timeDeserialize(InferenceEnvironment& iEnv) +bool timeDeserialize(InferenceEnvironment& iEnv, SystemOptions const& sys) { constexpr int32_t kNB_ITERS{20}; - std::unique_ptr rt{nvinfer1::createInferRuntime(sample::gLogger.getTRTLogger())}; - std::unique_ptr engine; + std::unique_ptr rt{createRuntime()}; + std::unique_ptr engine; - std::unique_ptr safeRT{sample::createSafeInferRuntime(sample::gLogger.getTRTLogger())}; - std::unique_ptr safeEngine; - - if (iEnv.safe) - { - ASSERT(sample::hasSafeRuntime() && safeRT != nullptr); - safeRT->setErrorRecorder(&gRecorder); - } + SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError); auto timeDeserializeFn = [&]() -> float { bool deserializeOK{false}; engine.reset(nullptr); - safeEngine.reset(nullptr); auto startClock = std::chrono::high_resolution_clock::now(); - if (iEnv.safe) - { - safeEngine.reset(safeRT->deserializeCudaEngine(iEnv.engineBlob.data(), iEnv.engineBlob.size())); - deserializeOK = (safeEngine != nullptr); - } - else + SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError); + + auto& reader = iEnv.engine.getFileReader(); + reader.reset(); + ASSERT(reader.isOpen()); +#if !TRT_WINML + for (auto const& pluginPath : sys.dynamicPlugins) { - engine.reset(rt->deserializeCudaEngine(iEnv.engineBlob.data(), iEnv.engineBlob.size())); - deserializeOK = (engine != nullptr); + rt->getPluginRegistry().loadLibrary(pluginPath.c_str()); } +#endif + engine.reset(rt->deserializeCudaEngine(reader)); + deserializeOK = (engine != nullptr); auto endClock = std::chrono::high_resolution_clock::now(); // return NAN if deserialization failed. return deserializeOK ? std::chrono::duration(endClock - startClock).count() : NAN; @@ -935,7 +1198,7 @@ bool timeDeserialize(InferenceEnvironment& iEnv) sample::gLogInfo << "Begin deserialization engine timing..." << std::endl; float const first = timeDeserializeFn(); - // Check if first deserialization suceeded. + // Check if first deserialization succeeded. if (std::isnan(first)) { sample::gLogError << "Engine deserialization failed." << std::endl; @@ -952,10 +1215,10 @@ bool timeDeserialize(InferenceEnvironment& iEnv) { totalTime += timeDeserializeFn(); } - const auto averageTime = totalTime / kNB_ITERS; + auto const averageTime = totalTime / kNB_ITERS; // reportGpuMemory sometimes reports zero after a single deserialization of a small engine, // so use the size of memory for all the iterations. - const auto totalEngineSizeGpu = reportGpuMemory(); + auto const totalEngineSizeGpu = reportGpuMemory(); sample::gLogInfo << "Total deserialization time = " << totalTime << " milliseconds in " << kNB_ITERS << " iterations, average time = " << averageTime << " milliseconds, first time = " << first << " milliseconds." << std::endl; @@ -965,8 +1228,8 @@ bool timeDeserialize(InferenceEnvironment& iEnv) // the average deserialization, return true, which means an error occurred. // The tolerance is set to 2x since the deserialization time is quick and susceptible // to caching issues causing problems in the first timing. 
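The timing in timeDeserialize is plain std::chrono around deserializeCudaEngine, repeated kNB_ITERS times so the first (cold) run can be compared against the average. A free-standing version of one timed deserialization, assuming the serialized engine already sits in a host buffer; blob and size are placeholders:

```cpp
// Sketch: measure how long one engine deserialization takes.
#include <NvInfer.h>
#include <chrono>
#include <cstddef>
#include <memory>

class SilentLogger : public nvinfer1::ILogger
{
public:
    void log(Severity, char const*) noexcept override {}
};

float timeOneDeserialization(void const* blob, std::size_t size)
{
    SilentLogger logger;
    std::unique_ptr<nvinfer1::IRuntime> runtime{nvinfer1::createInferRuntime(logger)};

    auto const start = std::chrono::high_resolution_clock::now();
    std::unique_ptr<nvinfer1::ICudaEngine> engine{runtime->deserializeCudaEngine(blob, size)};
    auto const end = std::chrono::high_resolution_clock::now();

    // Return -1 on failure, otherwise the elapsed time in milliseconds.
    return engine ? std::chrono::duration<float, std::milli>(end - start).count() : -1.F;
}
```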
- const auto tolerance = 2.0F; - const bool isSlowerThanExpected = first > averageTime * tolerance; + auto const tolerance = 2.0F; + bool const isSlowerThanExpected = first > averageTime * tolerance; if (isSlowerThanExpected) { sample::gLogInfo << "First deserialization time divided by average time is " << (first / averageTime) @@ -975,16 +1238,385 @@ bool timeDeserialize(InferenceEnvironment& iEnv) return isSlowerThanExpected; } -std::string getLayerInformation(const InferenceEnvironment& iEnv, nvinfer1::LayerInformationFormat format) +std::string getLayerInformation( + nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context, nvinfer1::LayerInformationFormat format) { - auto runtime = std::unique_ptr(nvinfer1::createInferRuntime(sample::gLogger.getTRTLogger())); - auto inspector = std::unique_ptr(iEnv.engine->createEngineInspector()); - if (!iEnv.context.empty()) + auto runtime = std::unique_ptr{createRuntime()}; + auto inspector = std::unique_ptr(engine->createEngineInspector()); + if (context != nullptr) { - inspector->setExecutionContext(iEnv.context.front().get()); + inspector->setExecutionContext(context); } std::string result = inspector->getEngineInformation(format); return result; } +void Binding::fill(std::string const& fileName) +{ + loadFromFile(fileName, static_cast(buffer->getHostBuffer()), buffer->getSize()); +} + +void Binding::fill() +{ + switch (dataType) + { + case nvinfer1::DataType::kBOOL: + { + fillBuffer(buffer->getHostBuffer(), volume, 0, 1); + break; + } + case nvinfer1::DataType::kINT32: + { + fillBuffer(buffer->getHostBuffer(), volume, -128, 127); + break; + } + case nvinfer1::DataType::kINT64: + { + fillBuffer(buffer->getHostBuffer(), volume, -128, 127); + break; + } + case nvinfer1::DataType::kINT8: + { + fillBuffer(buffer->getHostBuffer(), volume, -128, 127); + break; + } + case nvinfer1::DataType::kFLOAT: + { + fillBuffer(buffer->getHostBuffer(), volume, -1.0F, 1.0F); + break; + } + case nvinfer1::DataType::kHALF: + { + fillBuffer<__half>(buffer->getHostBuffer(), volume, -1.0F, 1.0F); + break; + } + case nvinfer1::DataType::kBF16: + { + fillBuffer(buffer->getHostBuffer(), volume, -1.0F, 1.0F); + break; + } + case nvinfer1::DataType::kUINT8: + { + fillBuffer(buffer->getHostBuffer(), volume, 0, 255); + break; + } + case nvinfer1::DataType::kFP8: ASSERT(false && "FP8 is not supported"); + case nvinfer1::DataType::kINT4: ASSERT(false && "INT4 is not supported"); + } +} + +void Binding::dump(std::ostream& os, Dims dims, Dims strides, int32_t vectorDim, int32_t spv, + std::string const separator /*= " "*/) const +{ + void* outputBuffer{}; + if (outputAllocator != nullptr) + { + outputBuffer = outputAllocator->getBuffer()->getHostBuffer(); + // Overwrite dimensions with those reported by the output allocator. + dims = outputAllocator->getFinalDims(); + os << "Final shape is " << dims << " reported by the output allocator." 
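getLayerInformation now receives the engine and an optional execution context directly and forwards them to IEngineInspector. The core of that call, as a sketch; passing a context is only needed when shapes are dynamic and should be resolved in the report:

```cpp
// Sketch: dump per-layer information from an engine with IEngineInspector.
#include <NvInfer.h>
#include <iostream>
#include <memory>

void printLayerInfo(nvinfer1::ICudaEngine& engine, nvinfer1::IExecutionContext* context)
{
    std::unique_ptr<nvinfer1::IEngineInspector> inspector{engine.createEngineInspector()};
    if (context != nullptr)
    {
        inspector->setExecutionContext(context); // Lets the report reflect the current shapes.
    }
    std::cout << inspector->getEngineInformation(nvinfer1::LayerInformationFormat::kJSON) << std::endl;
}
```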
<< std::endl; + } + else + { + outputBuffer = buffer->getHostBuffer(); + } + switch (dataType) + { + case nvinfer1::DataType::kBOOL: + { + dumpBuffer(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kINT32: + { + dumpBuffer(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kINT8: + { + dumpBuffer(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kFLOAT: + { + dumpBuffer(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kHALF: + { + dumpBuffer<__half>(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kBF16: + { + dumpBuffer(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kUINT8: + { + dumpBuffer(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kINT64: + { + dumpBuffer(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kFP8: ASSERT(false && "FP8 is not supported"); + case nvinfer1::DataType::kINT4: ASSERT(false && "INT4 is not supported"); + } +} + +void Bindings::addBinding(TensorInfo const& tensorInfo, std::string const& fileName /*= ""*/) +{ + auto const b = tensorInfo.bindingIndex; + while (mBindings.size() <= static_cast(b)) + { + mBindings.emplace_back(); + mDevicePointers.emplace_back(); + } + mNames[tensorInfo.name] = b; + mBindings[b].isInput = tensorInfo.isInput; + mBindings[b].volume = tensorInfo.vol; + mBindings[b].dataType = tensorInfo.dataType; + if (tensorInfo.isDynamic) + { + ASSERT(!tensorInfo.isInput); // Only output shape can be possibly unknown because of DDS. + if (mBindings[b].outputAllocator == nullptr) + { + if (mUseManaged) + { + mBindings[b].outputAllocator.reset(new OutputAllocator(new UnifiedMirroredBuffer)); + } + else + { + mBindings[b].outputAllocator.reset(new OutputAllocator(new DiscreteMirroredBuffer)); + } + } + } + else + { + if (mBindings[b].buffer == nullptr) + { + if (mUseManaged) + { + mBindings[b].buffer.reset(new UnifiedMirroredBuffer); + } + else + { + mBindings[b].buffer.reset(new DiscreteMirroredBuffer); + } + } + // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr + // even for empty tensors, so allocate a dummy byte. 
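Outputs whose final size is only known after execution (data-dependent shapes) are given an OutputAllocator above instead of a pre-sized buffer, and dump() later reads the final dimensions back from it. The shape of that interface, as a rough sketch with a naive cudaMalloc strategy; real code would reuse and align buffers, and newer TensorRT releases also offer reallocateOutputAsync:

```cpp
// Sketch: minimal IOutputAllocator for an output tensor with data-dependent shape.
#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <cstdint>

class SimpleOutputAllocator : public nvinfer1::IOutputAllocator
{
public:
    void* reallocateOutput(char const* /*tensorName*/, void* /*currentMemory*/, uint64_t size,
        uint64_t /*alignment*/) noexcept override
    {
        // Called once TensorRT knows the real output size; grow the buffer if needed.
        if (size > mSize)
        {
            cudaFree(mBuffer);
            mBuffer = nullptr;
            if (cudaMalloc(&mBuffer, size) != cudaSuccess)
            {
                return nullptr;
            }
            mSize = size;
        }
        return mBuffer;
    }

    void notifyShape(char const* /*tensorName*/, nvinfer1::Dims const& dims) noexcept override
    {
        mFinalDims = dims; // The shape to use when copying the result back to the host.
    }

    nvinfer1::Dims mFinalDims{};
    void* mBuffer{nullptr};
    uint64_t mSize{0};
};

// Usage: context.setOutputAllocator("output", &allocator); before enqueueV3 ("output" is a placeholder name).
```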
+ if (tensorInfo.vol == 0) + { + mBindings[b].buffer->allocate(1); + } + else + { + mBindings[b].buffer->allocate( + static_cast(tensorInfo.vol) * static_cast(dataTypeSize(tensorInfo.dataType))); + } + mDevicePointers[b] = mBindings[b].buffer->getDeviceBuffer(); + } + if (tensorInfo.isInput) + { + if (fileName.empty()) + { + fill(b); + } + else + { + fill(b, fileName); + } + } +} + +void** Bindings::getDeviceBuffers() +{ + return mDevicePointers.data(); +} + +void Bindings::transferInputToDevice(TrtCudaStream& stream) +{ + for (auto& b : mNames) + { + if (mBindings[b.second].isInput) + { + mBindings[b.second].buffer->hostToDevice(stream); + } + } +} + +void Bindings::transferOutputToHost(TrtCudaStream& stream) +{ + for (auto& b : mNames) + { + if (!mBindings[b.second].isInput) + { + if (mBindings[b.second].outputAllocator != nullptr) + { + mBindings[b.second].outputAllocator->getBuffer()->deviceToHost(stream); + } + else + { + mBindings[b.second].buffer->deviceToHost(stream); + } + } + } +} + +void Bindings::dumpBindingValues(nvinfer1::IExecutionContext const& context, int32_t binding, std::ostream& os, + std::string const& separator /*= " "*/, int32_t batch /*= 1*/) const +{ + auto const tensorName = context.getEngine().getIOTensorName(binding); + Dims dims = context.getTensorShape(tensorName); + Dims strides = context.getTensorStrides(tensorName); + int32_t vectorDim = context.getEngine().getTensorVectorizedDim(tensorName); + int32_t const spv = context.getEngine().getTensorComponentsPerElement(tensorName); + + mBindings[binding].dump(os, dims, strides, vectorDim, spv, separator); +} + +namespace +{ + +std::string genFilenameSafeString(std::string const& s) +{ + std::string res = s; + static std::string const allowedSpecialChars{"._-,"}; + for (auto& c : res) + { + if (!isalnum(c) && allowedSpecialChars.find(c) == std::string::npos) + { + c = '_'; + } + } + return res; +} + +Dims getBindingDimensions(nvinfer1::IExecutionContext const& context, std::string const& name) +{ + return context.getTensorShape(name.c_str()); +} +} // namespace + +void Bindings::dumpRawBindingToFiles(nvinfer1::IExecutionContext const& context, std::ostream& os) const +{ + os << "Dumping I/O Bindings to RAW Files:" << std::endl; + for (auto const& n : mNames) + { + auto name = n.first; + auto bIndex = n.second; + auto const& binding = mBindings[bIndex]; + void* outputBuffer{}; + if (binding.outputAllocator != nullptr) + { + outputBuffer = binding.outputAllocator->getBuffer()->getHostBuffer(); + } + else + { + outputBuffer = binding.buffer->getHostBuffer(); + } + + Dims dims = getBindingDimensions(context, name); + std::string dimsStr; + std::string dotStr; + + for (int32_t i = 0; i < dims.nbDims; i++) + { + dimsStr += dotStr + std::to_string(dims.d[i]); + dotStr = "."; + } + + std::string const bindingTypeStr = (binding.isInput ? "input" : "output"); + + std::stringstream fileName; + fileName << genFilenameSafeString(name) << "." << bindingTypeStr << "." << dimsStr << "." 
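setTensorAddresses in this hunk walks the I/O tensors by name and hands TensorRT either a device pointer or an output allocator; this name-based loop is the TensorRT 10 replacement for the bindings array that enqueueV2 used to take. Condensed into a sketch, assuming every tensor already has a device buffer in a map:

```cpp
// Sketch: bind every I/O tensor by name, then launch with enqueueV3.
#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <string>
#include <unordered_map>

bool bindAndRun(nvinfer1::ICudaEngine const& engine, nvinfer1::IExecutionContext& context,
    std::unordered_map<std::string, void*> const& deviceBuffers, cudaStream_t stream)
{
    for (int32_t i = 0; i < engine.getNbIOTensors(); ++i)
    {
        char const* name = engine.getIOTensorName(i);
        if (!context.setTensorAddress(name, deviceBuffers.at(name)))
        {
            return false;
        }
    }
    // All addresses (and any dynamic input shapes) must be set before enqueueV3.
    return context.enqueueV3(stream);
}
```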
<< binding.dataType + << ".raw"; + + os << "Writing file for " << bindingTypeStr << " binding " << name << " (with datatype " << binding.dataType + << " and dimensions " << dimsStr << ") to " << fileName.str() << std::endl; + + std::ofstream f(fileName.str(), std::ios::out | std::ios::binary); + ASSERT(f && "Cannot open file for write"); + f.write(static_cast(outputBuffer), binding.volume * samplesCommon::elementSize(binding.dataType)); + f.close(); + } +} + +void Bindings::dumpBindingDimensions( + std::string const& name, nvinfer1::IExecutionContext const& context, std::ostream& os) const +{ + auto const dims = context.getTensorShape(name.c_str()); + // Do not add a newline terminator, because the caller may be outputting a JSON string. + os << dims; +} + +std::unordered_map Bindings::getBindings(std::function predicate) const +{ + std::unordered_map bindings; + for (auto const& n : mNames) + { + auto const binding = n.second; + if (predicate(mBindings[binding])) + { + bindings.insert(n); + } + } + return bindings; +} + +bool Bindings::setTensorAddresses(nvinfer1::IExecutionContext& context) const +{ + for (auto const& b : mNames) + { + auto const name = b.first.c_str(); + auto const location = context.getEngine().getTensorLocation(name); + if (location == TensorLocation::kDEVICE) + { + if (mBindings[b.second].outputAllocator != nullptr) + { + if (!context.setOutputAllocator(name, mBindings[b.second].outputAllocator.get())) + { + return false; + } + } + else + { + if (!context.setTensorAddress(name, mDevicePointers[b.second])) + { + return false; + } + } + } + } + return true; +} + +bool DebugTensorWriter::processDebugTensor(void const* addr, nvinfer1::TensorLocation location, nvinfer1::DataType type, + nvinfer1::Dims const& shape, char const* name, cudaStream_t stream) +{ + CHECK(cudaStreamSynchronize(stream)); + // Store data from callback. + int64_t size = std::accumulate(shape.d, shape.d + shape.nbDims, 1LL, std::multiplies{}) + * samplesCommon::elementSize(type); + std::vector hostDataOut(size, 0); + CHECK(cudaMemcpy(hostDataOut.data(), addr, size, cudaMemcpyDeviceToHost)); + + auto it = mDebugTensorFileNames.find(name); + ASSERT(it != mDebugTensorFileNames.end()); + std::string fileName = it->second; + + std::ofstream f(fileName, std::ios::out | std::ios::binary); + ASSERT(f && "Cannot open file for write"); + sample::gLogInfo << "Writing to file " << fileName << " for debug tensor " << name << std::endl; + f.write(hostDataOut.data(), size); + f.close(); + + CHECK(cudaStreamSynchronize(stream)); + return true; +} + } // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleInference.h b/src/Detector/tensorrt_yolo/common/sampleInference.h index 1c21f592..d9ebed92 100644 --- a/src/Detector/tensorrt_yolo/common/sampleInference.h +++ b/src/Detector/tensorrt_yolo/common/sampleInference.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -17,76 +18,243 @@ #ifndef TRT_SAMPLE_INFERENCE_H #define TRT_SAMPLE_INFERENCE_H +#include "sampleDevice.h" +#include "sampleEngines.h" #include "sampleReporting.h" #include "sampleUtils.h" +#include #include +#include #include #include #include -#include "NvInfer.h" +namespace sample +{ -#if (NV_TENSORRT_MAJOR > 7) +// IDebugListener class for writing debug tensors to output file. +class DebugTensorWriter : public nvinfer1::IDebugListener +{ +public: + DebugTensorWriter(std::unordered_map fileNames) + : mDebugTensorFileNames(fileNames) + { + } -#include "NvInferSafeRuntime.h" + bool processDebugTensor(void const* addr, nvinfer1::TensorLocation location, nvinfer1::DataType type, + nvinfer1::Dims const& shape, char const* name, cudaStream_t stream) override; -namespace sample -{ +private: + std::unordered_map mDebugTensorFileNames; +}; struct InferenceEnvironment { - TrtUniquePtr engine; + InferenceEnvironment() = delete; + InferenceEnvironment(InferenceEnvironment const& other) = delete; + InferenceEnvironment(InferenceEnvironment&& other) = delete; + InferenceEnvironment(BuildEnvironment& bEnv) : engine(std::move(bEnv.engine)), safe(bEnv.engine.isSafe()) + { + } + + LazilyDeserializedEngine engine; std::unique_ptr profiler; - std::vector> context; + std::vector> contexts; + std::vector + deviceMemory; //< Device memory used for inference when the allocation strategy is not static. std::vector> bindings; + std::unique_ptr listener; bool error{false}; - std::vector engineBlob; - bool safe{false}; - std::unique_ptr safeEngine; - std::vector> safeContext; - template - inline ContextType* getContext(int32_t streamIdx); + inline nvinfer1::IExecutionContext* getContext(int32_t streamIdx); + + //! Storage for input shape tensors. + //! + //! It's important that the addresses of the data do not change between the calls to + //! setTensorAddress/setInputShape (which tells TensorRT where the input shape tensor is) + //! and enqueueV3 (when TensorRT might use the input shape tensor). + //! + //! The input shape tensors could alternatively be handled via member bindings, + //! but it simplifies control-flow to store the data here since it's shared across + //! the bindings. + std::list> inputShapeTensorValues; }; -template <> inline nvinfer1::IExecutionContext* InferenceEnvironment::getContext(int32_t streamIdx) { - return context[streamIdx].get(); -} - -template <> -inline nvinfer1::safe::IExecutionContext* InferenceEnvironment::getContext(int32_t streamIdx) -{ - return safeContext[streamIdx].get(); + return contexts[streamIdx].get(); } //! //! \brief Set up contexts and bindings for inference //! -bool setUpInference(InferenceEnvironment& iEnv, const InferenceOptions& inference); +bool setUpInference(InferenceEnvironment& iEnv, InferenceOptions const& inference, SystemOptions const& system); //! //! \brief Deserialize the engine and time how long it takes. //! -bool timeDeserialize(InferenceEnvironment& iEnv); +bool timeDeserialize(InferenceEnvironment& iEnv, SystemOptions const& sys); //! //! \brief Run inference and collect timing, return false if any error hit during inference //! 
bool runInference( - const InferenceOptions& inference, InferenceEnvironment& iEnv, int32_t device, std::vector& trace); + InferenceOptions const& inference, InferenceEnvironment& iEnv, int32_t device, std::vector& trace); //! //! \brief Get layer information of the engine. //! -std::string getLayerInformation(const InferenceEnvironment& iEnv, nvinfer1::LayerInformationFormat format); +std::string getLayerInformation( + nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context, nvinfer1::LayerInformationFormat format); -} // namespace sample +struct Binding +{ + bool isInput{false}; + std::unique_ptr buffer; + std::unique_ptr outputAllocator; + int64_t volume{0}; + nvinfer1::DataType dataType{nvinfer1::DataType::kFLOAT}; + + void fill(std::string const& fileName); + + void fill(); + + void dump(std::ostream& os, nvinfer1::Dims dims, nvinfer1::Dims strides, int32_t vectorDim, int32_t spv, + std::string const separator = " ") const; +}; + +struct TensorInfo +{ + int32_t bindingIndex{-1}; + char const* name{nullptr}; + nvinfer1::Dims dims{}; + bool isDynamic{}; + int32_t comps{-1}; + nvinfer1::Dims strides{}; + int32_t vectorDimIndex{-1}; + bool isInput{}; + nvinfer1::DataType dataType{}; + int64_t vol{-1}; + + void updateVolume(int32_t batch) + { + vol = volume(dims, strides, vectorDimIndex, comps, batch); + } +}; + +class Bindings +{ +public: + Bindings() = delete; + explicit Bindings(bool useManaged) + : mUseManaged(useManaged) + { + } + + void addBinding(TensorInfo const& tensorInfo, std::string const& fileName = ""); -#endif + void** getDeviceBuffers(); + + void transferInputToDevice(TrtCudaStream& stream); + + void transferOutputToHost(TrtCudaStream& stream); + + void fill(int binding, std::string const& fileName) + { + mBindings[binding].fill(fileName); + } + + void fill(int binding) + { + mBindings[binding].fill(); + } + + void dumpBindingDimensions( + std::string const& name, nvinfer1::IExecutionContext const& context, std::ostream& os) const; + + void dumpBindingValues(nvinfer1::IExecutionContext const& context, int32_t binding, std::ostream& os, + std::string const& separator = " ", int32_t batch = 1) const; + + void dumpRawBindingToFiles(nvinfer1::IExecutionContext const& context, std::ostream& os) const; + + void dumpInputs(nvinfer1::IExecutionContext const& context, std::ostream& os) const + { + auto isInput = [](Binding const& b) { return b.isInput; }; + dumpBindings(context, isInput, os); + } + + void dumpOutputs(nvinfer1::IExecutionContext const& context, std::ostream& os) const; + + void dumpBindings(nvinfer1::IExecutionContext const& context, std::ostream& os) const + { + auto all = [](Binding const& b) { return true; }; + dumpBindings(context, all, os); + } + + void dumpBindings(nvinfer1::IExecutionContext const& context, std::function predicate, + std::ostream& os) const + { + for (auto const& n : mNames) + { + auto const name = n.first; + auto const binding = n.second; + if (predicate(mBindings[binding])) + { + os << n.first << ": ("; + dumpBindingDimensions(name, context, os); + os << ")" << std::endl; + + dumpBindingValues(context, binding, os); + os << std::endl; + } + } + } + + std::unordered_map getInputBindings() const + { + auto isInput = [](Binding const& b) { return b.isInput; }; + return getBindings(isInput); + } + + std::unordered_map getOutputBindings() const + { + auto isOutput = [](Binding const& b) { return !b.isInput; }; + return getBindings(isOutput); + } + + std::unordered_map getBindings() const + { + auto all = [](Binding const& b) { return 
true; }; + return getBindings(all); + } + + std::unordered_map getBindings(std::function predicate) const; + + bool setTensorAddresses(nvinfer1::IExecutionContext& context) const; + +private: + std::unordered_map mNames; + std::vector mBindings; + std::vector mDevicePointers; + bool mUseManaged{false}; +}; + +struct TaskInferenceEnvironment +{ + TaskInferenceEnvironment(std::string engineFile, InferenceOptions inference, int32_t deviceId = 0, + int32_t DLACore = -1, int32_t bs = batchNotProvided); + InferenceOptions iOptions{}; + int32_t device{defaultDevice}; + int32_t batch{batchNotProvided}; + std::unique_ptr iEnv; + std::vector trace; +}; + +bool runMultiTasksInference(std::vector>& tEnvList); + +} // namespace sample #endif // TRT_SAMPLE_INFERENCE_H diff --git a/src/Detector/tensorrt_yolo/common/sampleOptions.cpp b/src/Detector/tensorrt_yolo/common/sampleOptions.cpp index 0afd163f..bdb1b21c 100644 --- a/src/Detector/tensorrt_yolo/common/sampleOptions.cpp +++ b/src/Detector/tensorrt_yolo/common/sampleOptions.cpp @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -19,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -27,29 +29,64 @@ #include "logger.h" #include "sampleOptions.h" - +#include "sampleUtils.h" +using namespace nvinfer1; namespace sample { namespace { -std::vector splitToStringVec(const std::string& option, char separator) +static const std::map> kUNIT_MULTIPLIERS{ + {'B', {1, "Bytes"}}, + {'K', {1 << 10, "Kibibytes"}}, + {'M', {1 << 20, "Mebibytes"}}, + {'G', {1 << 30, "Gibibytes"}}, +}; + +std::string addDefaultUnitSuffixIfNotSpecified(std::string const& option, char defaultUnit) { - std::vector options; + char lastChar = option.at(option.size() - 1); + return std::isdigit(lastChar) ? option + defaultUnit : option; +} - for (size_t start = 0; start < option.length();) +// Returns "B (Bytes), K (Kilobytes), ..." +std::string getAvailableUnitSuffixes() +{ + std::ostringstream ss; + for (auto it = kUNIT_MULTIPLIERS.begin(); it != kUNIT_MULTIPLIERS.end(); ++it) { - size_t separatorIndex = option.find(separator, start); - if (separatorIndex == std::string::npos) + if (it != kUNIT_MULTIPLIERS.begin()) { - separatorIndex = option.length(); + ss << ", "; } - options.emplace_back(option.substr(start, separatorIndex - start)); - start = separatorIndex + 1; + ss << it->first << " (" << it->second.second << ")"; } + return ss.str(); +} - return options; +// Numeric trtexec arguments can have unit specifiers in similar to polygraphy. +// E.g. --weightStreamingBudget=20M would be 20 Mebibytes (base 2). 
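Stepping back to the sampleInference.h declarations above: TaskInferenceEnvironment bundles its own options, device id and trace so that runMultiTasksInference can drive several engines side by side. A usage sketch built only from those declarations; the engine file names are placeholders and the default-constructed InferenceOptions would normally be tuned first:

```cpp
// Sketch: running two engines side by side with the multi-task API declared above.
#include <memory>
#include <vector>

#include "sampleInference.h"
#include "sampleOptions.h"

bool runTwoEngines()
{
    using namespace sample;
    InferenceOptions opts{}; // Default options; streams, duration, etc. can be adjusted here.

    std::vector<std::unique_ptr<TaskInferenceEnvironment>> tasks;
    tasks.emplace_back(new TaskInferenceEnvironment("detector.engine", opts, /*deviceId*/ 0));
    tasks.emplace_back(new TaskInferenceEnvironment("classifier.engine", opts, /*deviceId*/ 0));

    // Each task gets its own thread; the call returns false if any task hit an error.
    return runMultiTasksInference(tasks);
}
```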
+int64_t getUnitMultiplier(std::string const& option) +{ + char lastChar = option.at(option.size() - 1); + if (!std::isdigit(lastChar)) + { + char unit = std::toupper(lastChar); + auto found = kUNIT_MULTIPLIERS.find(unit); + if (found == kUNIT_MULTIPLIERS.end()) + { + std::ostringstream ss; + ss << "Error parsing \"" << option << "\": invalid unit specifier '" << unit + << "'. Valid base-2 unit suffixes include: "; + ss << getAvailableUnitSuffixes() << "."; + throw std::invalid_argument(ss.str()); + } + return found->second.first; + } + + // Return bytes by default + return kUNIT_MULTIPLIERS.at('B').first; } template @@ -64,6 +101,12 @@ int32_t stringToValue(const std::string& option) return std::stoi(option); } +template <> +size_t stringToValue(const std::string& option) +{ + return std::stoi(option) * getUnitMultiplier(option); +} + template <> float stringToValue(const std::string& option) { @@ -73,7 +116,7 @@ float stringToValue(const std::string& option) template <> double stringToValue(const std::string& option) { - return std::stod(option); + return std::stod(option) * getUnitMultiplier(option); } template <> @@ -86,6 +129,10 @@ template <> std::vector stringToValue>(const std::string& option) { std::vector shape; + if (option == "scalar") + { + return shape; + } std::vector dimsStrings = splitToStringVec(option, 'x'); for (const auto& d : dimsStrings) { @@ -98,8 +145,9 @@ template <> nvinfer1::DataType stringToValue(const std::string& option) { const std::unordered_map strToDT{{"fp32", nvinfer1::DataType::kFLOAT}, - {"fp16", nvinfer1::DataType::kHALF}, {"int8", nvinfer1::DataType::kINT8}, - {"int32", nvinfer1::DataType::kINT32}}; + {"fp16", nvinfer1::DataType::kHALF}, {"bf16", nvinfer1::DataType::kBF16}, {"int8", nvinfer1::DataType::kINT8}, + {"fp8", nvinfer1::DataType::kFP8}, {"int32", nvinfer1::DataType::kINT32}, {"int64", nvinfer1::DataType::kINT64}, + {"bool", nvinfer1::DataType::kBOOL}, {"uint8", nvinfer1::DataType::kUINT8}, {"int4", nvinfer1::DataType::kINT4}}; const auto& dt = strToDT.find(option); if (dt == strToDT.end()) { @@ -108,6 +156,21 @@ nvinfer1::DataType stringToValue(const std::string& option) return dt->second; } +template <> +nvinfer1::DeviceType stringToValue(std::string const& option) +{ + std::unordered_map const strToDevice = { + {"GPU", nvinfer1::DeviceType::kGPU}, + {"DLA", nvinfer1::DeviceType::kDLA}, + }; + auto const& device = strToDevice.find(option); + if (device == strToDevice.end()) + { + throw std::invalid_argument("Invalid Device Type " + option); + } + return device->second; +} + template <> nvinfer1::TensorFormats stringToValue(const std::string& option) { @@ -116,7 +179,8 @@ nvinfer1::TensorFormats stringToValue(const std::string {"chw2", nvinfer1::TensorFormat::kCHW2}, {"chw4", nvinfer1::TensorFormat::kCHW4}, {"hwc8", nvinfer1::TensorFormat::kHWC8}, {"chw16", nvinfer1::TensorFormat::kCHW16}, {"chw32", nvinfer1::TensorFormat::kCHW32}, {"dhwc8", nvinfer1::TensorFormat::kDHWC8}, - {"hwc", nvinfer1::TensorFormat::kHWC}, {"dla_linear", nvinfer1::TensorFormat::kDLA_LINEAR}, + {"cdhw32", nvinfer1::TensorFormat::kCDHW32}, {"hwc", nvinfer1::TensorFormat::kHWC}, + {"dhwc", nvinfer1::TensorFormat::kDHWC}, {"dla_linear", nvinfer1::TensorFormat::kDLA_LINEAR}, {"dla_hwc4", nvinfer1::TensorFormat::kDLA_HWC4}}; nvinfer1::TensorFormats formats{}; for (auto f : optionStrings) @@ -149,11 +213,82 @@ IOFormat stringToValue(const std::string& option) return ioFormat; } +template <> +SparsityFlag stringToValue(std::string const& option) +{ + std::unordered_map 
const table{ + {"disable", SparsityFlag::kDISABLE}, {"enable", SparsityFlag::kENABLE}, {"force", SparsityFlag::kFORCE}}; + auto search = table.find(option); + if (search == table.end()) + { + throw std::invalid_argument(std::string("Unknown sparsity mode: ") + option); + } + if (search->second == SparsityFlag::kFORCE) + { + sample::gLogWarning << "--sparsity=force has been deprecated. " + << "Please use to rewrite the weights to a sparsity pattern " + << "and then run with --sparsity=enable" << std::endl; + } + + return search->second; +} + +template <> +WeightStreamingBudget stringToValue(std::string const& option) +{ + WeightStreamingBudget budget; + if (option.find('%') != std::string::npos) + { + double percent = std::stod(option); + if (!(percent >= 0 && percent <= 100.0)) + { + std::ostringstream err; + err << "The weight streaming percent must be between 0 and 100."; + throw std::invalid_argument(err.str()); + } + budget.percent = percent; + } + else + { + double bytes = stringToValue(option); + if (!(bytes == WeightStreamingBudget::kAUTOMATIC || bytes == WeightStreamingBudget::kDISABLE || bytes >= 0)) + { + std::ostringstream err; + err << "The weight streaming budget must be " << WeightStreamingBudget::kDISABLE << ", " + << WeightStreamingBudget::kAUTOMATIC << ", or at least 0."; + throw std::invalid_argument(err.str()); + } + budget.bytes = static_cast(bytes); + } + return budget; +} + template std::pair splitNameAndValue(const std::string& s) { std::string tensorName; std::string valueString; + + // Support 'inputName':Path format for --loadInputs flag when dealing with Windows paths. + // i.e. 'inputName':c:\inputData + std::vector quoteNameRange{ splitToStringVec(s, '\'') }; + // splitToStringVec returns the entire string when delimiter is not found, so it's size is always at least 1 + if (quoteNameRange.size() != 1) + { + if (quoteNameRange.size() != 3) + { + std::string errorMsg = std::string("Found invalid number of \'s when parsing ") + s + + std::string(". Expected: 2, received: ") + std::to_string(quoteNameRange.size() -1) + + ". Please ensure that a singular comma is used within each comma-separated key-value pair for options like --inputIOFormats, --optShapes, --optShapesCalib, --layerPrecisions, etc."; + throw std::invalid_argument(errorMsg); + } + // Everything before the second "'" is the name. + tensorName = quoteNameRange[0] + quoteNameRange[1]; + // Path is the last string - ignoring leading ":" so slice it with [1:] + valueString = quoteNameRange[2].substr(1); + return std::pair(tensorName, stringToValue(valueString)); + } + // Split on the last : std::vector nameRange{splitToStringVec(s, ':')}; // Everything before the last : is the name @@ -181,16 +316,71 @@ const char* boolToEnabled(bool enable) return enable ? "Enabled" : "Disabled"; } +//! A helper function similar to sep.join(list) in Python. +template +std::string joinValuesToString(std::vector const& list, std::string const& sep) +{ + std::ostringstream os; + for (int32_t i = 0, n = list.size(); i < n; ++i) + { + os << list[i]; + if (i != n - 1) + { + os << sep; + } + } + return os.str(); +} + +template +std::string joinValuesToString(std::array const& list, std::string const& sep) +{ + return joinValuesToString(std::vector(list.begin(), list.end()), sep); +} + //! Check if input option exists in input arguments. -//! If it does: return its value, erase the argument and return true. +//! If it does: set its value, and return true //! If it does not: return false. 
template -bool getAndDelOption(Arguments& arguments, const std::string& option, T& value) +bool getOption(Arguments& arguments, const std::string& option, T& value) { - const auto match = arguments.find(option); + auto const match = arguments.find(option); if (match != arguments.end()) { - value = stringToValue(match->second); + value = stringToValue(match->second.first); + return true; + } + + return false; +} + +//! Check if input option exists in input arguments. +//! If it does: set its value, erase the argument and return true. +//! If it does not: return false. +template +bool getAndDelOption(Arguments& arguments, const std::string& option, T_& value) +{ + bool found = getOption(arguments, option, value); + if (found) + { + const auto match = arguments.find(option); + arguments.erase(match); + } + + return found; +} + +//! Check if input option exists in input arguments. +//! If it does: set its value and position, erase the argument and return true. +//! If it does not: return false. +template +bool getAndDelOptionWithPosition(Arguments& arguments, std::string const& option, T_& value, int32_t& pos) +{ + auto const match = arguments.find(option); + if (match != arguments.end()) + { + value = stringToValue(match->second.first); + pos = match->second.second; arguments.erase(match); return true; } @@ -198,8 +388,31 @@ bool getAndDelOption(Arguments& arguments, const std::string& option, T& value) return false; } +//! Check if input option exists in input arguments behind the position spcecified by pos. +//! If it does: set its value, erase the argument and return true. +//! If it does not: return false. +template +bool getAndDelOptionBehind(Arguments& arguments, std::string const& option, int32_t pos, T_& value) +{ + auto const match = arguments.equal_range(option); + if (match.first == match.second) + { + return false; + } + for (auto i = match.first; i != match.second; ++i) + { + if (i->second.second - pos == 1) + { + value = stringToValue(i->second.first); + arguments.erase(i); + return true; + } + } + return false; +} + //! Check if input option exists in input arguments. -//! If it does: return false in value, erase the argument and return true. +//! If it does: set false in value, erase the argument and return true. //! If it does not: return false. 
bool getAndDelNegOption(Arguments& arguments, const std::string& option, bool& value) { @@ -224,34 +437,37 @@ bool getAndDelRepeatedOption(Arguments& arguments, const std::string& option, st return false; } - auto addToValues = [&values](Arguments::value_type& argValue) {values.emplace_back(stringToValue(argValue.second));}; + auto addToValues + = [&values](Arguments::value_type& argValue) { values.emplace_back(stringToValue(argValue.second.first)); }; std::for_each(match.first, match.second, addToValues); arguments.erase(match.first, match.second); return true; } -void insertShapesBuild(std::unordered_map& shapes, nvinfer1::OptProfileSelector selector, const std::string& name, const std::vector& dims) +void insertShapesBuild(BuildOptions::ShapeProfile& shapes, nvinfer1::OptProfileSelector selector, + const std::string& name, const std::vector& dims) { shapes[name][static_cast(selector)] = dims; } -void insertShapesInference(std::unordered_map>& shapes, const std::string& name, const std::vector& dims) +void insertShapesInference( + InferenceOptions::ShapeProfile& shapes, std::string const& name, std::vector const& dims) { shapes[name] = dims; } std::string removeSingleQuotationMarks(std::string& str) { - std::vector strList{splitToStringVec(str, '\'')}; - // Remove all the escaped single quotation marks - std::string retVal = ""; - // Do not really care about unterminated sequences - for (size_t i = 0; i < strList.size(); i++) - { - retVal += strList[i]; - } - return retVal; + std::vector strList{splitToStringVec(str, '\'')}; + // Remove all the escaped single quotation marks + std::string retVal; + // Do not really care about unterminated sequences + for (size_t i = 0; i < strList.size(); i++) + { + retVal += strList[i]; + } + return retVal; } void getLayerPrecisions(Arguments& arguments, char const* argument, LayerPrecisions& layerPrecisions) @@ -293,7 +509,41 @@ void getLayerOutputTypes(Arguments& arguments, char const* argument, LayerOutput } } -bool getShapesBuild(Arguments& arguments, std::unordered_map& shapes, char const* argument, +void getLayerDeviceTypes(Arguments& arguments, char const* argument, LayerDeviceTypes& layerDeviceTypes) +{ + std::string list; + if (!getAndDelOption(arguments, argument, list)) + { + return; + } + + // The layerDeviceTypes flag contains comma-separated layerName:deviceType pairs. + std::vector deviceList{splitToStringVec(list, ',')}; + for (auto const& s : deviceList) + { + auto nameDevicePair = splitNameAndValue(s); + auto const layerName = removeSingleQuotationMarks(nameDevicePair.first); + layerDeviceTypes[layerName] = stringToValue(nameDevicePair.second); + } +} + +void getStringsSet(Arguments& arguments, char const* argument, StringSet& stringSet) +{ + std::string list; + if (!getAndDelOption(arguments, argument, list)) + { + return; + } + + // The layerPrecisions flag contains comma-separated layerName:precision pairs. 
+ std::vector strings{splitToStringVec(list, ',')}; + for (auto const& s : strings) + { + stringSet.insert(s); + } +} + +bool getShapesBuild(Arguments& arguments, BuildOptions::ShapeProfile& shapes, char const* argument, nvinfer1::OptProfileSelector selector) { std::string list; @@ -309,7 +559,7 @@ bool getShapesBuild(Arguments& arguments, std::unordered_map>& shapes, const char* argument) +bool getShapesInference(Arguments& arguments, InferenceOptions::ShapeProfile& shapes, const char* argument) { std::string list; bool retVal = getAndDelOption(arguments, argument, list); @@ -324,67 +574,195 @@ bool getShapesInference(Arguments& arguments, std::unordered_map& shapes, bool minShapes, bool optShapes, bool maxShapes, bool calib) +void fillShapes(BuildOptions::ShapeProfile& shapes, std::string const& name, ShapeRange const& sourceShapeRange, + nvinfer1::OptProfileSelector minDimsSource, nvinfer1::OptProfileSelector optDimsSource, + nvinfer1::OptProfileSelector maxDimsSource) { - // Only accept optShapes only or all three of minShapes, optShapes, maxShapes - if ( ((minShapes || maxShapes) && !optShapes) // minShapes only, maxShapes only, both minShapes and maxShapes + insertShapesBuild( + shapes, nvinfer1::OptProfileSelector::kMIN, name, sourceShapeRange[static_cast(minDimsSource)]); + insertShapesBuild( + shapes, nvinfer1::OptProfileSelector::kOPT, name, sourceShapeRange[static_cast(optDimsSource)]); + insertShapesBuild( + shapes, nvinfer1::OptProfileSelector::kMAX, name, sourceShapeRange[static_cast(maxDimsSource)]); +} + +void processShapes(BuildOptions::ShapeProfile& shapes, bool minShapes, bool optShapes, bool maxShapes, bool calib) +{ + // Only accept optShapes only or all three of minShapes, optShapes, maxShapes when calib is set + if (((minShapes || maxShapes) && !optShapes) // minShapes only, maxShapes only, both minShapes and maxShapes || (minShapes && !maxShapes && optShapes) // both minShapes and optShapes || (!minShapes && maxShapes && optShapes)) // both maxShapes and optShapes { if (calib) { - throw std::invalid_argument("Must specify only --optShapesCalib or all of --minShapesCalib, --optShapesCalib, --maxShapesCalib"); - } - else - { - throw std::invalid_argument("Must specify only --optShapes or all of --minShapes, --optShapes, --maxShapes"); + throw std::invalid_argument( + "Must specify only --optShapesCalib or all of --minShapesCalib, --optShapesCalib, --maxShapesCalib"); } } - // If optShapes only, expand optShapes to minShapes and maxShapes - if (optShapes && !minShapes && !maxShapes) + if (!minShapes && !optShapes && !maxShapes) { - std::unordered_map newShapes; - for (auto& s : shapes) + return; + } + + BuildOptions::ShapeProfile newShapes; + for (auto& s : shapes) + { + nvinfer1::OptProfileSelector minDimsSource, optDimsSource, maxDimsSource; + minDimsSource = nvinfer1::OptProfileSelector::kMIN; + optDimsSource = nvinfer1::OptProfileSelector::kOPT; + maxDimsSource = nvinfer1::OptProfileSelector::kMAX; + + // Populate missing minShapes + if (!minShapes) + { + if (optShapes) + { + minDimsSource = optDimsSource; + sample::gLogWarning << "optShapes is being broadcasted to minShapes for tensor " << s.first + << std::endl; + } + else + { + minDimsSource = maxDimsSource; + sample::gLogWarning << "maxShapes is being broadcasted to minShapes for tensor " << s.first + << std::endl; + } + } + + // Populate missing optShapes + if (!optShapes) { - insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kMIN, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); - 
insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kOPT, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); - insertShapesBuild(newShapes, nvinfer1::OptProfileSelector::kMAX, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); + if (maxShapes) + { + optDimsSource = maxDimsSource; + sample::gLogWarning << "maxShapes is being broadcasted to optShapes for tensor " << s.first + << std::endl; + } + else + { + optDimsSource = minDimsSource; + sample::gLogWarning << "minShapes is being broadcasted to optShapes for tensor " << s.first + << std::endl; + } + } + + // Populate missing maxShapes + if (!maxShapes) + { + if (optShapes) + { + maxDimsSource = optDimsSource; + sample::gLogWarning << "optShapes is being broadcasted to maxShapes for tensor " << s.first + << std::endl; + } + else + { + maxDimsSource = minDimsSource; + sample::gLogWarning << "minShapes is being broadcasted to maxShapes for tensor " << s.first + << std::endl; + } } - shapes = newShapes; + + fillShapes(newShapes, s.first, s.second, minDimsSource, optDimsSource, maxDimsSource); } + shapes = newShapes; } -template -void printShapes(std::ostream& os, const char* phase, const T& shapes) +bool getOptimizationProfiles( + Arguments& arguments, std::vector& optProfiles, char const* argument) { - if (shapes.empty()) + bool retValue{false}; + int32_t pos{}; + size_t profileIndex{}; + + auto getShapes + = [](BuildOptions::ShapeProfile& shapes, std::string const& list, nvinfer1::OptProfileSelector selector) { + std::vector shapeList{splitToStringVec(list, ',')}; + for (auto const& s : shapeList) + { + auto nameDimsPair = splitNameAndValue>(s); + auto tensorName = removeSingleQuotationMarks(nameDimsPair.first); + auto dims = nameDimsPair.second; + insertShapesBuild(shapes, selector, tensorName, dims); + } + }; + + while (getAndDelOptionWithPosition(arguments, argument, profileIndex, pos)) { - os << "Input " << phase << " shapes: model" << std::endl; + BuildOptions::ShapeProfile optProfile{}; + bool minShapes{false}, maxShapes{false}, optShapes{false}; + for (int32_t i = 0; i < nvinfer1::EnumMax(); i++, pos++) + { + std::string value; + + if (!minShapes && getAndDelOptionBehind(arguments, "--minShapes", pos, value)) + { + minShapes = true; + getShapes(optProfile, value, nvinfer1::OptProfileSelector::kMIN); + } + else if (!maxShapes && getAndDelOptionBehind(arguments, "--maxShapes", pos, value)) + { + maxShapes = true; + getShapes(optProfile, value, nvinfer1::OptProfileSelector::kMAX); + } + else if (!optShapes && getAndDelOptionBehind(arguments, "--optShapes", pos, value)) + { + optShapes = true; + getShapes(optProfile, value, nvinfer1::OptProfileSelector::kOPT); + } + else + { + break; + } + } + processShapes(optProfile, minShapes, optShapes, maxShapes, false); + if (profileIndex >= optProfiles.size()) + { + optProfiles.resize(profileIndex + 1); + } + if (!optProfiles[profileIndex].empty()) + { + throw std::invalid_argument("Optimization profile index cannot be the same."); + } + optProfiles[profileIndex] = optProfile; + retValue = true; } - else + + profileIndex = 0; + for (auto const& optProfile : optProfiles) { - for (const auto& s : shapes) + if (optProfile.empty()) { - os << "Input " << phase << " shape: " << s.first << "=" << s.second << std::endl; + throw std::invalid_argument(std::string("Found invalid or missing shape spec at profile index ") + + std::to_string(profileIndex) + std::string(". 
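getOptimizationProfiles above lets the command line describe several profiles, each with min/opt/max shapes that processShapes broadcasts when only some of them are given. At build time each triple lands on an IOptimizationProfile; a sketch of that destination API, with "input" and the dimensions as placeholders:

```cpp
// Sketch: how per-tensor min/opt/max shapes are registered with the builder.
#include <NvInfer.h>

void addProfile(nvinfer1::IBuilder& builder, nvinfer1::IBuilderConfig& config)
{
    using nvinfer1::OptProfileSelector;
    nvinfer1::IOptimizationProfile* profile = builder.createOptimizationProfile();

    // "input" and these dimensions stand in for a real dynamic input tensor.
    profile->setDimensions("input", OptProfileSelector::kMIN, nvinfer1::Dims4{1, 3, 224, 224});
    profile->setDimensions("input", OptProfileSelector::kOPT, nvinfer1::Dims4{8, 3, 224, 224});
    profile->setDimensions("input", OptProfileSelector::kMAX, nvinfer1::Dims4{16, 3, 224, 224});

    // Repeat with further profiles to mirror multiple profile specs on the command line.
    config.addOptimizationProfile(profile);
}
```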
")); } + ++profileIndex; } + return retValue; } -std::ostream& printBatch(std::ostream& os, int32_t maxBatch) +template +void printShapes(std::ostream& os, char const* phase, T const& shapes, int32_t profileIndex) { - if (maxBatch != maxBatchNotProvided) + if (shapes.empty()) { - os << maxBatch; + os << "Input " << phase << " shapes: model" << std::endl; } else { - os << "explicit batch"; + std::string profileString = (profileIndex != -1 && strcmp(phase, "build") == 0) + ? "(profile " + std::to_string(profileIndex) + ")" + : ""; + for (auto const& s : shapes) + { + os << "Input " << phase << " shape " << profileString << ": " << s.first << "=" << s.second << std::endl; + } } - return os; } -std::ostream& printTacticSources(std::ostream& os, nvinfer1::TacticSources enabledSources, nvinfer1::TacticSources disabledSources) +std::ostream& printTacticSources( + std::ostream& os, nvinfer1::TacticSources enabledSources, nvinfer1::TacticSources disabledSources) { if (!enabledSources && !disabledSources) { @@ -405,24 +783,41 @@ std::ostream& printTacticSources(std::ostream& os, nvinfer1::TacticSources enabl addSource(1U << static_cast(nvinfer1::TacticSource::kCUBLAS), "cublas"); addSource(1U << static_cast(nvinfer1::TacticSource::kCUBLAS_LT), "cublasLt"); -#if (NV_TENSORRT_MAJOR > 7) addSource(1U << static_cast(nvinfer1::TacticSource::kCUDNN), "cudnn"); -#endif + addSource(1U << static_cast(nvinfer1::TacticSource::kEDGE_MASK_CONVOLUTIONS), "edge mask convolutions"); + addSource(1U << static_cast(nvinfer1::TacticSource::kJIT_CONVOLUTIONS), "JIT convolutions"); } return os; } std::ostream& printPrecision(std::ostream& os, BuildOptions const& options) { + if (options.stronglyTyped) + { + os << "Strongly Typed"; + return os; + } os << "FP32"; if (options.fp16) { os << "+FP16"; } + if (options.bf16) + { + os << "+BF16"; + } if (options.int8) { os << "+INT8"; } + if (options.fp8) + { + os << "+FP8"; + } + if (options.int4) + { + os << "+INT4"; + } if (options.precisionConstraints == PrecisionConstraints::kOBEY) { os << " (obey precision constraints)"; @@ -434,13 +829,27 @@ std::ostream& printPrecision(std::ostream& os, BuildOptions const& options) return os; } -std::ostream& printTimingCache(std::ostream& os, BuildOptions const& options) +std::ostream& printTempfileControls(std::ostream& os, TempfileControlFlags const tempfileControls) +{ + auto getFlag = [&](TempfileControlFlag f) -> char const* { + bool allowed = !!(tempfileControls & (1U << static_cast(f))); + return allowed ? 
"allow" : "deny"; + }; + auto const inMemory = getFlag(TempfileControlFlag::kALLOW_IN_MEMORY_FILES); + auto const temporary = getFlag(TempfileControlFlag::kALLOW_TEMPORARY_FILES); + + os << "{ in_memory: " << inMemory << ", temporary: " << temporary << " }"; + + return os; +} + +std::ostream& printTimingCache(std::ostream& os, TimingCacheMode const& timingCacheMode) { - switch (options.timingCacheMode) + switch (timingCacheMode) { - case TimingCacheMode::kGLOBAL: os << "global"; break; - case TimingCacheMode::kLOCAL: os << "local"; break; - case TimingCacheMode::kDISABLE: os << "disable"; break; + case TimingCacheMode::kGLOBAL: os << "global"; break; + case TimingCacheMode::kLOCAL: os << "local"; break; + case TimingCacheMode::kDISABLE: os << "disable"; break; } return os; } @@ -459,20 +868,67 @@ std::ostream& printSparsity(std::ostream& os, BuildOptions const& options) std::ostream& printMemoryPools(std::ostream& os, BuildOptions const& options) { - auto const printValueOrDefault = [&os](double const val) { + auto const printValueOrDefault = [&os](double const val, char const* unit = "MiB") { if (val >= 0) { - os << val << " MiB"; + os << val << " " << unit; } else { os << "default"; } }; - os << "workspace: "; printValueOrDefault(options.workspace); os << ", "; - os << "dlaSRAM: "; printValueOrDefault(options.dlaSRAM); os << ", "; - os << "dlaLocalDRAM: "; printValueOrDefault(options.dlaLocalDRAM); os << ", "; - os << "dlaGlobalDRAM: "; printValueOrDefault(options.dlaGlobalDRAM); + os << "workspace: "; + printValueOrDefault(options.workspace); + os << ", "; + os << "dlaSRAM: "; + printValueOrDefault(options.dlaSRAM); + os << ", "; + os << "dlaLocalDRAM: "; + printValueOrDefault(options.dlaLocalDRAM); + os << ", "; + os << "dlaGlobalDRAM: "; + printValueOrDefault(options.dlaGlobalDRAM); + os << ", "; + os << "tacticSharedMem: "; + printValueOrDefault(options.tacticSharedMem, "KiB"); + return os; +} + +std::string previewFeatureToString(PreviewFeature feature) +{ + // clang-format off + switch (feature) + { + case PreviewFeature::kPROFILE_SHARING_0806: + { + gLogWarning << "profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect." << std::endl; + break; + } + case PreviewFeature::kALIASED_PLUGIN_IO_10_03: return "kALIASED_PLUGIN_IO_10_03"; + } + return "Invalid Preview Feature"; + // clang-format on +} + +std::ostream& printPreviewFlags(std::ostream& os, BuildOptions const& options) +{ + if (options.previewFeatures.empty()) + { + os << "Use default preview flags."; + return os; + } + + auto const addFlag = [&](PreviewFeature feat) { + int32_t featVal = static_cast(feat); + if (options.previewFeatures.find(featVal) != options.previewFeatures.end()) + { + os << previewFeatureToString(feat) << (options.previewFeatures.at(featVal) ? 
" [ON], " : " [OFF], "); + } + }; + + addFlag(PreviewFeature::kALIASED_PLUGIN_IO_10_03); + return os; } @@ -487,51 +943,41 @@ Arguments argsToArgumentsMap(int32_t argc, char* argv[]) if (valuePtr) { std::string value{valuePtr + 1}; - arguments.emplace(std::string(argv[i], valuePtr - argv[i]), value); + arguments.emplace(std::string(argv[i], valuePtr - argv[i]), std::make_pair(value, i)); } else { - arguments.emplace(argv[i], ""); + arguments.emplace(argv[i], std::make_pair(std::string(""), i)); } } return arguments; } -void BaseModelOptions::parse(Arguments& arguments) +namespace { - if (getAndDelOption(arguments, "--onnx", model)) - { - format = ModelFormat::kONNX; - } - else if (getAndDelOption(arguments, "--uff", model)) - { - format = ModelFormat::kUFF; - } - else if (getAndDelOption(arguments, "--model", model)) +std::string resolveHomeDirectoryOnLinux(std::string const& model) +{ + std::string filePath{model}; +#ifndef _WIN32 + if (filePath[0] == '~') { - format = ModelFormat::kCAFFE; + char const* home = std::getenv("HOME"); + if (home) + { + filePath.replace(0, 1, home); + } } +#endif + return filePath; } +} // namespace -void UffInput::parse(Arguments& arguments) +void BaseModelOptions::parse(Arguments& arguments) { - getAndDelOption(arguments, "--uffNHWC", NHWC); - std::vector args; - if (getAndDelRepeatedOption(arguments, "--uffInput", args)) + if (getAndDelOption(arguments, "--onnx", model)) { - for (const auto& i : args) - { - std::vector values{splitToStringVec(i, ',')}; - if (values.size() == 4) - { - nvinfer1::Dims3 dims{std::stoi(values[1]), std::stoi(values[2]), std::stoi(values[3])}; - inputs.emplace_back(values[0], dims); - } - else - { - throw std::invalid_argument(std::string("Invalid uffInput ") + i); - } - } + format = ModelFormat::kONNX; + model = resolveHomeDirectoryOnLinux(model); } } @@ -541,56 +987,66 @@ void ModelOptions::parse(Arguments& arguments) switch (baseModel.format) { - case ModelFormat::kCAFFE: + case ModelFormat::kONNX: + case ModelFormat::kANY: { - getAndDelOption(arguments, "--deploy", prototxt); break; } - case ModelFormat::kUFF: - { - uffInputs.parse(arguments); - if (uffInputs.inputs.empty()) - { - throw std::invalid_argument("Uff models require at least one input"); - } - break; } - case ModelFormat::kONNX: - break; - case ModelFormat::kANY: + + if (baseModel.format == ModelFormat::kONNX) { - if (getAndDelOption(arguments, "--deploy", prototxt)) + if (!outputs.empty()) { - baseModel.format = ModelFormat::kCAFFE; + throw std::invalid_argument("The --output flag should not be used with ONNX models."); } - break; } +} + +void getTempfileControls(Arguments& arguments, char const* argument, TempfileControlFlags& tempfileControls) +{ + std::string list; + if (!getAndDelOption(arguments, argument, list)) + { + return; } - // The --output flag should only be used with Caffe and UFF. It has no effect on ONNX. 
- std::vector outArgs; - if (getAndDelRepeatedOption(arguments, "--output", outArgs)) + std::vector controlList{splitToStringVec(list, ',')}; + for (auto const& s : controlList) { - for (const auto& o : outArgs) + auto controlAllowPair = splitNameAndValue(s); + bool allowed{false}; + int32_t offset{-1}; + + if (controlAllowPair.second.compare("allow") == 0) { - for (auto& v : splitToStringVec(o, ',')) - { - outputs.emplace_back(std::move(v)); - } + allowed = true; } - } - if (baseModel.format == ModelFormat::kCAFFE || baseModel.format == ModelFormat::kUFF) - { - if (outputs.empty()) + else if (controlAllowPair.second.compare("deny") != 0) { - throw std::invalid_argument("Caffe and Uff models require at least one output"); + throw std::invalid_argument("--tempfileControls value should be `deny` or `allow`"); } - } - else if (baseModel.format == ModelFormat::kONNX) - { - if (!outputs.empty()) + + if (controlAllowPair.first.compare("in_memory") == 0) { - throw std::invalid_argument("The --output flag should not be used with ONNX models."); + offset = static_cast(TempfileControlFlag::kALLOW_IN_MEMORY_FILES); + } + else if (controlAllowPair.first.compare("temporary") == 0) + { + offset = static_cast(TempfileControlFlag::kALLOW_TEMPORARY_FILES); + } + else + { + throw std::invalid_argument(std::string{"Unknown --tempfileControls key "} + controlAllowPair.first); + } + + if (allowed) + { + tempfileControls |= (1U << offset); + } + else + { + tempfileControls &= ~(1U << offset); } } } @@ -610,38 +1066,59 @@ void BuildOptions::parse(Arguments& arguments) getFormats(inputFormats, "--inputIOFormats"); getFormats(outputFormats, "--outputIOFormats"); - bool addedExplicitBatchFlag{false}; - getAndDelOption(arguments, "--explicitBatch", addedExplicitBatchFlag); - if (addedExplicitBatchFlag) - { - sample::gLogWarning << "--explicitBatch flag has been deprecated and has no effect!" << std::endl; - sample::gLogWarning << "Explicit batch dim is automatically enabled if input model is ONNX or if dynamic " - << "shapes are provided when the engine is built." 
<< std::endl;
-    }
-
-    bool minShapes = getShapesBuild(arguments, shapes, "--minShapes", nvinfer1::OptProfileSelector::kMIN);
-    bool optShapes = getShapesBuild(arguments, shapes, "--optShapes", nvinfer1::OptProfileSelector::kOPT);
-    bool maxShapes = getShapesBuild(arguments, shapes, "--maxShapes", nvinfer1::OptProfileSelector::kMAX);
-    processShapes(shapes, minShapes, optShapes, maxShapes, false);
-    bool minShapesCalib
-        = getShapesBuild(arguments, shapesCalib, "--minShapesCalib", nvinfer1::OptProfileSelector::kMIN);
-    bool optShapesCalib
-        = getShapesBuild(arguments, shapesCalib, "--optShapesCalib", nvinfer1::OptProfileSelector::kOPT);
-    bool maxShapesCalib
-        = getShapesBuild(arguments, shapesCalib, "--maxShapesCalib", nvinfer1::OptProfileSelector::kMAX);
-    processShapes(shapesCalib, minShapesCalib, optShapesCalib, maxShapesCalib, true);
+    bool getCalibProfile = getAndDelOption(arguments, "--calibProfile", calibProfile);
+    if (!getOptimizationProfiles(arguments, optProfiles, "--profile"))
+    {
+        ShapeProfile shapes;
+        bool minShapes{false}, optShapes{false}, maxShapes{false};
+        try
+        {
+            minShapes = getShapesBuild(arguments, shapes, "--minShapes", nvinfer1::OptProfileSelector::kMIN);
+            optShapes = getShapesBuild(arguments, shapes, "--optShapes", nvinfer1::OptProfileSelector::kOPT);
+            maxShapes = getShapesBuild(arguments, shapes, "--maxShapes", nvinfer1::OptProfileSelector::kMAX);
+        }
+        catch (std::invalid_argument const& arg)
+        {
+            throw std::invalid_argument(arg.what()
+                + std::string(" conversion failure: failed to parse minShapes/optShapes/maxShapes. Please double check " "your input string."));
+        }
-    bool addedExplicitPrecisionFlag{false};
-    getAndDelOption(arguments, "--explicitPrecision", addedExplicitPrecisionFlag);
-    if (addedExplicitPrecisionFlag)
+        processShapes(shapes, minShapes, optShapes, maxShapes, false);
+        optProfiles.emplace_back(shapes);
+    }
+
+    if (calibProfile >= optProfiles.size())
+    {
+        throw std::invalid_argument(
+            std::string("--calibProfile must be less than the number of optimization profiles."));
+    }
+
+    BuildOptions::ShapeProfile dummyShapes;
+
+    bool remainingMinShapes = getShapesBuild(arguments, dummyShapes, "--minShapes", nvinfer1::OptProfileSelector::kMIN);
+    bool remainingOptShapes = getShapesBuild(arguments, dummyShapes, "--optShapes", nvinfer1::OptProfileSelector::kOPT);
+    bool remainingMaxShapes = getShapesBuild(arguments, dummyShapes, "--maxShapes", nvinfer1::OptProfileSelector::kMAX);
+    if (remainingMinShapes || remainingOptShapes || remainingMaxShapes)
     {
-        sample::gLogWarning << "--explicitPrecision flag has been deprecated and has no effect!" << std::endl;
+        throw std::invalid_argument("Multiple --minShapes/--optShapes/--maxShapes without --profile are not allowed. ");
     }
-    if (getAndDelOption(arguments, "--workspace", workspace))
+    bool minShapesCalib{false}, optShapesCalib{false}, maxShapesCalib{false};
+    try
     {
-        sample::gLogWarning << "--workspace flag has been deprecated by --memPoolSize flag." << std::endl;
+        minShapesCalib = getShapesBuild(arguments, shapesCalib, "--minShapesCalib", nvinfer1::OptProfileSelector::kMIN);
+        optShapesCalib = getShapesBuild(arguments, shapesCalib, "--optShapesCalib", nvinfer1::OptProfileSelector::kOPT);
+        maxShapesCalib = getShapesBuild(arguments, shapesCalib, "--maxShapesCalib", nvinfer1::OptProfileSelector::kMAX);
+    }
+    catch (std::invalid_argument const& arg)
+    {
+        throw std::invalid_argument(arg.what()
+            + std::string(" conversion failure: failed to parse minShapesCalib/optShapesCalib/maxShapesCalib. 
Please " + "double check your input string.")); + } + + processShapes(shapesCalib, minShapesCalib, optShapesCalib, maxShapesCalib, true); std::string memPoolSizes; getAndDelOption(arguments, "--memPoolSize", memPoolSizes); @@ -650,26 +1127,47 @@ void BuildOptions::parse(Arguments& arguments) { std::string memPoolName; double memPoolSize; - std::tie(memPoolName, memPoolSize) = splitNameAndValue(memPoolSpec); + try + { + std::string strPoolSize; + std::tie(memPoolName, strPoolSize) = splitNameAndValue(memPoolSpec); + memPoolSize = stringToValue(addDefaultUnitSuffixIfNotSpecified(strPoolSize, 'M')); + } + catch (std::invalid_argument const& arg) + { + throw std::invalid_argument(arg.what() + + std::string( + " conversion failure: failed to parse --memPoolSize. Please double check your input string.")); + } + if (memPoolSize < 0) { throw std::invalid_argument(std::string("Negative memory pool size: ") + std::to_string(memPoolSize)); } if (memPoolName == "workspace") { - workspace = memPoolSize; + // use unit in MB. + workspace = memPoolSize / 1.0_MiB; } else if (memPoolName == "dlaSRAM") { - dlaSRAM = memPoolSize; + // use unit in MB. + dlaSRAM = memPoolSize / 1.0_MiB; } else if (memPoolName == "dlaLocalDRAM") { - dlaLocalDRAM = memPoolSize; + // use unit in MB. + dlaLocalDRAM = memPoolSize / 1.0_MiB; } else if (memPoolName == "dlaGlobalDRAM") { - dlaGlobalDRAM = memPoolSize; + // use unit in MB. + dlaGlobalDRAM = memPoolSize / 1.0_MiB; + } + else if (memPoolName == "tacticSharedMem") + { + // use unit in KB. + tacticSharedMem = memPoolSize / 1.0_KiB; } else if (!memPoolName.empty()) { @@ -677,8 +1175,6 @@ void BuildOptions::parse(Arguments& arguments) } } - getAndDelOption(arguments, "--maxBatch", maxBatch); - getAndDelOption(arguments, "--minTiming", minTiming); getAndDelOption(arguments, "--avgTiming", avgTiming); bool best{false}; @@ -687,16 +1183,79 @@ void BuildOptions::parse(Arguments& arguments) { int8 = true; fp16 = true; + + // BF16 only supported on Ampere+ + if (samplesCommon::getSMVersion() >= 0x0800) + { + bf16 = true; + } } getAndDelOption(arguments, "--refit", refittable); + + getAndDelOption(arguments, "--weightless", stripWeights); + getAndDelOption(arguments, "--stripWeights", stripWeights); + + bool stripAllWeights{}; + getAndDelOption(arguments, "--stripAllWeights", stripAllWeights); + if (stripAllWeights) + { + refittable = true; + stripWeights = true; + } + + // --vc and --versionCompatible are synonyms + getAndDelOption(arguments, "--vc", versionCompatible); + if (!versionCompatible) + { + getAndDelOption(arguments, "--versionCompatible", versionCompatible); + } + +#if !TRT_WINML + // --pi and --pluginInstanceNorm are synonyms + getAndDelOption(arguments, "--pi", pluginInstanceNorm); + if (!pluginInstanceNorm) + { + getAndDelOption(arguments, "--pluginInstanceNorm", pluginInstanceNorm); + } +#endif + + getAndDelOption(arguments, "--excludeLeanRuntime", excludeLeanRuntime); + getAndDelOption(arguments, "--noCompilationCache", disableCompilationCache); getAndDelNegOption(arguments, "--noTF32", tf32); getAndDelOption(arguments, "--fp16", fp16); + getAndDelOption(arguments, "--bf16", bf16); getAndDelOption(arguments, "--int8", int8); + getAndDelOption(arguments, "--fp8", fp8); + getAndDelOption(arguments, "--int4", int4); + getAndDelOption(arguments, "--stronglyTyped", stronglyTyped); + if (stronglyTyped) + { + auto disableAndLog = [](bool& flag, std::string mode, std::string type) { + if (flag) + { + flag = false; + sample::gLogWarning << "Invalid usage, setting " << mode 
+ << " mode is not allowed if graph is strongly typed. Disabling BuilderFlag::" + << type << "." << std::endl; + } + }; + disableAndLog(fp16, "fp16", "kFP16"); + disableAndLog(int8, "int8", "kINT8"); + disableAndLog(bf16, "bf16", "kBF16"); + disableAndLog(fp8, "fp8", "kFP8"); + disableAndLog(int4, "int4", "kINT4"); + } + + if (fp8 && int8) + { + throw std::invalid_argument("Invalid usage, fp8 and int8 aren't allowed to be enabled together."); + } getAndDelOption(arguments, "--safe", safe); - getAndDelOption(arguments, "--consistency", consistency); + getAndDelOption(arguments, "--buildDLAStandalone", buildDLAStandalone); + getAndDelOption(arguments, "--allowGPUFallback", allowGPUFallback); getAndDelOption(arguments, "--restricted", restricted); - + getAndDelOption(arguments, "--skipInference", skipInference); getAndDelOption(arguments, "--directIO", directIO); std::string precisionConstraintsString; @@ -720,10 +1279,11 @@ void BuildOptions::parse(Arguments& arguments) getLayerPrecisions(arguments, "--layerPrecisions", layerPrecisions); getLayerOutputTypes(arguments, "--layerOutputTypes", layerOutputTypes); + getLayerDeviceTypes(arguments, "--layerDeviceTypes", layerDeviceTypes); if (layerPrecisions.empty() && layerOutputTypes.empty() && precisionConstraints != PrecisionConstraints::kNONE) { - sample::gLogWarning << "When --precisionConstraints flag is set to \"obey\" or \"prefer\", please add " + sample::gLogWarning << R"(When --precisionConstraints flag is set to "obey" or "prefer", please add )" << "--layerPrecision/--layerOutputTypes flags to set layer-wise precisions and output " << "types." << std::endl; } @@ -731,79 +1291,52 @@ void BuildOptions::parse(Arguments& arguments) && precisionConstraints == PrecisionConstraints::kNONE) { sample::gLogWarning << "--layerPrecision/--layerOutputTypes flags have no effect when --precisionConstraints " - << "flag is set to \"none\"." << std::endl; + << R"(flag is set to "none".)" << std::endl; } - std::string sparsityString; - getAndDelOption(arguments, "--sparsity", sparsityString); - if (sparsityString == "disable") - { - sparsity = SparsityFlag::kDISABLE; - } - else if (sparsityString == "enable") - { - sparsity = SparsityFlag::kENABLE; - } - else if (sparsityString == "force") - { - sparsity = SparsityFlag::kFORCE; - } - else if (!sparsityString.empty()) - { - throw std::invalid_argument(std::string("Unknown sparsity mode: ") + sparsityString); - } + getStringsSet(arguments, "--markDebug", debugTensors); + + getAndDelOption(arguments, "--sparsity", sparsity); bool calibCheck = getAndDelOption(arguments, "--calib", calibration); - if (int8 && calibCheck && !shapes.empty() && shapesCalib.empty()) + if (int8 && calibCheck && !optProfiles[calibProfile].empty() && shapesCalib.empty()) { - shapesCalib = shapes; + shapesCalib = optProfiles[calibProfile]; } - - std::string profilingVerbosityString; - if (getAndDelOption(arguments, "--nvtxMode", profilingVerbosityString)) + else if (!shapesCalib.empty() && getCalibProfile) { - sample::gLogWarning << "--nvtxMode flag has been deprecated by --profilingVerbosity flag." << std::endl; + sample::gLogWarning + << "--calibProfile have no effect when --minShapesCalib/--optShapesCalib/--maxShapesCalib is set." 
+ << std::endl; } + std::string profilingVerbosityString; + getAndDelOption(arguments, "--profilingVerbosity", profilingVerbosityString); if (profilingVerbosityString == "layer_names_only") { -#if (NV_TENSORRT_MAJOR > 7) profilingVerbosity = nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY; -#else - profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; -#endif } else if (profilingVerbosityString == "none") { profilingVerbosity = nvinfer1::ProfilingVerbosity::kNONE; } -#if (NV_TENSORRT_MAJOR > 7) else if (profilingVerbosityString == "detailed") { profilingVerbosity = nvinfer1::ProfilingVerbosity::kDETAILED; } -#endif else if (profilingVerbosityString == "default") { -#if (NV_TENSORRT_MAJOR > 7) sample::gLogWarning << "--profilingVerbosity=default has been deprecated by " "--profilingVerbosity=layer_names_only." << std::endl; profilingVerbosity = nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY; -#else - profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; -#endif } else if (profilingVerbosityString == "verbose") { -#if (NV_TENSORRT_MAJOR > 7) sample::gLogWarning << "--profilingVerbosity=verbose has been deprecated by --profilingVerbosity=detailed." << std::endl; profilingVerbosity = nvinfer1::ProfilingVerbosity::kDETAILED; -#else - profilingVerbosity = nvinfer1::ProfilingVerbosity::kDEFAULT; -#endif } else if (!profilingVerbosityString.empty()) { @@ -814,6 +1347,8 @@ void BuildOptions::parse(Arguments& arguments) { load = true; } + getAndDelOption(arguments, "--getPlanVersionOnly", getPlanVersionOnly); + if (getAndDelOption(arguments, "--saveEngine", engine)) { save = true; @@ -858,12 +1393,18 @@ void BuildOptions::parse(Arguments& arguments) { source = nvinfer1::TacticSource::kCUBLAS_LT; } -#if (NV_TENSORRT_MAJOR > 7) else if (t == "CUDNN") { source = nvinfer1::TacticSource::kCUDNN; } -#endif + else if (t == "EDGE_MASK_CONVOLUTIONS") + { + source = nvinfer1::TacticSource::kEDGE_MASK_CONVOLUTIONS; + } + else if (t == "JIT_CONVOLUTIONS") + { + source = nvinfer1::TacticSource::kJIT_CONVOLUTIONS; + } else { throw std::invalid_argument(std::string("Unknown tactic source: ") + t); @@ -887,38 +1428,179 @@ void BuildOptions::parse(Arguments& arguments) } } - bool noBuilderCache{false}; - getAndDelOption(arguments, "--noBuilderCache", noBuilderCache); - getAndDelOption(arguments, "--timingCacheFile", timingCacheFile); - if (noBuilderCache) + bool noBuilderCache{false}; + getAndDelOption(arguments, "--noBuilderCache", noBuilderCache); + getAndDelOption(arguments, "--timingCacheFile", timingCacheFile); + if (noBuilderCache) + { + timingCacheMode = TimingCacheMode::kDISABLE; + } + else if (!timingCacheFile.empty()) + { + timingCacheMode = TimingCacheMode::kGLOBAL; + } + else + { + timingCacheMode = TimingCacheMode::kLOCAL; + } + getAndDelOption(arguments, "--errorOnTimingCacheMiss", errorOnTimingCacheMiss); + getAndDelOption(arguments, "--builderOptimizationLevel", builderOptimizationLevel); + getAndDelOption(arguments, "--maxTactics", maxTactics); + + std::string runtimePlatformArgs; + getAndDelOption(arguments, "--runtimePlatform", runtimePlatformArgs); + if (runtimePlatformArgs == "SameAsBuild" || runtimePlatformArgs.empty()) + { + runtimePlatform = RuntimePlatform::kSAME_AS_BUILD; + } + else if (runtimePlatformArgs == "WindowsAMD64") + { + runtimePlatform = RuntimePlatform::kWINDOWS_AMD64; + } + else + { + throw std::invalid_argument(std::string("Unknown runtime platform: ") + runtimePlatformArgs + + ". 
Valid options: SameAsBuild, WindowsAMD64."); + } + + std::string hardwareCompatibleArgs; + getAndDelOption(arguments, "--hardwareCompatibilityLevel", hardwareCompatibleArgs); + if (hardwareCompatibleArgs == "none" || hardwareCompatibleArgs.empty()) + { + hardwareCompatibilityLevel = HardwareCompatibilityLevel::kNONE; + } + else if (samplesCommon::toLower(hardwareCompatibleArgs) == "ampere+") + { + hardwareCompatibilityLevel = HardwareCompatibilityLevel::kAMPERE_PLUS; + } + else + { + throw std::invalid_argument(std::string("Unknown hardwareCompatibilityLevel: ") + hardwareCompatibleArgs + + ". Valid options: none, ampere+."); + } + + if (pluginInstanceNorm && (versionCompatible || hardwareCompatibilityLevel == HardwareCompatibilityLevel::kAMPERE_PLUS)) + { + throw std::invalid_argument("Plugin InstanceNorm cannot be used with version compatible or hardware compatible engines!"); + } + + getAndDelOption(arguments, "--maxAuxStreams", maxAuxStreams); + + std::string previewFeaturesBuf; + getAndDelOption(arguments, "--preview", previewFeaturesBuf); + std::vector previewFeaturesVec{splitToStringVec(previewFeaturesBuf, ',')}; + for (auto featureName : previewFeaturesVec) + { + bool enable{false}; + if (featureName.front() == '+') + { + enable = true; + } + else if (featureName.front() != '-') + { + throw std::invalid_argument( + "Preview features must be prefixed with + or -, indicating whether it should be enabled or disabled " + "respectively."); + } + featureName.erase(0, 1); + + PreviewFeature feat{}; + if (featureName == "profileSharing0806") + { + sample::gLogWarning + << "profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect." + << std::endl; + } + else if (featureName == "aliasedPluginIO1003") + { + feat = PreviewFeature::kALIASED_PLUGIN_IO_10_03; + } + else + { + throw std::invalid_argument(std::string("Unknown preview feature: ") + featureName); + } + previewFeatures[static_cast(feat)] = enable; + } + + getAndDelOption(arguments, "--tempdir", tempdir); + getTempfileControls(arguments, "--tempfileControls", tempfileControls); + + std::string runtimeMode; + getAndDelOption(arguments, "--useRuntime", runtimeMode); + if (runtimeMode == "full") { - timingCacheMode = TimingCacheMode::kDISABLE; + useRuntime = RuntimeMode::kFULL; } - else if (!timingCacheFile.empty()) + else if (runtimeMode == "dispatch") { - timingCacheMode = TimingCacheMode::kGLOBAL; + useRuntime = RuntimeMode::kDISPATCH; } - else + else if (runtimeMode == "lean") { - timingCacheMode = TimingCacheMode::kLOCAL; + useRuntime = RuntimeMode::kLEAN; + } + else if (!runtimeMode.empty()) + { + throw std::invalid_argument(std::string("Unknown useRuntime: ") + runtimeMode); } + + if ((useRuntime == RuntimeMode::kDISPATCH || useRuntime == RuntimeMode::kLEAN) && !versionCompatible) + { + versionCompatible = true; + sample::gLogWarning << "Implicitly enabling --versionCompatible since --useRuntime=" << runtimeMode + << " is set." 
<< std::endl; + } + + if (useRuntime != RuntimeMode::kFULL && !load) + { + throw std::invalid_argument(std::string("Building a TensorRT engine requires --useRuntime=full.")); + } + + getAndDelOption(arguments, "--leanDLLPath", leanDLLPath); + + // Don't delete the option because the inference option parser requires it + getOption(arguments, "--allowWeightStreaming", allowWeightStreaming); } void SystemOptions::parse(Arguments& arguments) { getAndDelOption(arguments, "--device", device); getAndDelOption(arguments, "--useDLACore", DLACore); - getAndDelOption(arguments, "--allowGPUFallback", fallback); +#if !TRT_WINML std::string pluginName; while (getAndDelOption(arguments, "--plugins", pluginName)) { + sample::gLogWarning << "--plugins flag has been deprecated, use --staticPlugins flag instead." << std::endl; plugins.emplace_back(pluginName); } + while (getAndDelOption(arguments, "--staticPlugins", pluginName)) + { + plugins.emplace_back(pluginName); + } + while (getAndDelOption(arguments, "--setPluginsToSerialize", pluginName)) + { + setPluginsToSerialize.emplace_back(pluginName); + } + while (getAndDelOption(arguments, "--dynamicPlugins", pluginName)) + { + dynamicPlugins.emplace_back(pluginName); + } + getAndDelOption(arguments, "--ignoreParsedPluginLibs", ignoreParsedPluginLibs); +#endif } +constexpr int64_t WeightStreamingBudget::kDISABLE; +constexpr int64_t WeightStreamingBudget::kAUTOMATIC; + void InferenceOptions::parse(Arguments& arguments) { - getAndDelOption(arguments, "--streams", streams); + + if (getAndDelOption(arguments, "--streams", infStreams)) + { + sample::gLogWarning << "--streams flag has been deprecated, use --infStreams flag instead." << std::endl; + } + getAndDelOption(arguments, "--infStreams", infStreams); + getAndDelOption(arguments, "--iterations", iterations); getAndDelOption(arguments, "--duration", duration); getAndDelOption(arguments, "--warmUp", warmup); @@ -935,9 +1617,9 @@ void InferenceOptions::parse(Arguments& arguments) getAndDelOption(arguments, "--threads", threads); getAndDelOption(arguments, "--useCudaGraph", graph); getAndDelOption(arguments, "--separateProfileRun", rerun); - getAndDelOption(arguments, "--buildOnly", skip); getAndDelOption(arguments, "--timeDeserialize", timeDeserialize); getAndDelOption(arguments, "--timeRefit", timeRefit); + getAndDelOption(arguments, "--persistentCacheRatio", persistentCacheRatio); std::string list; getAndDelOption(arguments, "--loadInputs", list); @@ -945,25 +1627,81 @@ void InferenceOptions::parse(Arguments& arguments) splitInsertKeyValue(inputsList, inputs); getShapesInference(arguments, shapes, "--shapes"); - getAndDelOption(arguments, "--batch", batch); + setOptProfile = getAndDelOption(arguments, "--useProfile", optProfileIndex); + + std::string allocationStrategyString; + getAndDelOption(arguments, "--allocationStrategy", allocationStrategyString); + if (allocationStrategyString == "static") + { + memoryAllocationStrategy = MemoryAllocationStrategy::kSTATIC; + } + else if (allocationStrategyString == "profile") + { + memoryAllocationStrategy = MemoryAllocationStrategy::kPROFILE; + } + else if (allocationStrategyString == "runtime") + { + memoryAllocationStrategy = MemoryAllocationStrategy::kRUNTIME; + } + else if (!allocationStrategyString.empty()) + { + throw std::invalid_argument(std::string("Unknown allocationStrategy: ") + allocationStrategyString); + } + + bool allowWs{false}; + getAndDelOption(arguments, "--allowWeightStreaming", allowWs); + bool wsBudgetFound = getAndDelOption(arguments, 
"--weightStreamingBudget", weightStreamingBudget); + if (wsBudgetFound && !allowWs) + { + throw std::invalid_argument( + "The weight streaming budget can only be set with --allowWeightStreaming specified."); + } + if (allowWs && weightStreamingBudget.isDisabled()) + { + sample::gLogWarning << "The engine can stream its weights but it will not at runtime because " + "--weightStreamingBudget unset or set to " + << WeightStreamingBudget::kDISABLE << "." << std::endl; + } + + std::string debugTensorList; + getAndDelOption(arguments, "--saveDebugTensors", debugTensorList); + std::vector fileNames{splitToStringVec(debugTensorList, ',')}; + splitInsertKeyValue(fileNames, debugTensorFileNames); } void ReportingOptions::parse(Arguments& arguments) { - getAndDelOption(arguments, "--percentile", percentile); getAndDelOption(arguments, "--avgRuns", avgs); getAndDelOption(arguments, "--verbose", verbose); getAndDelOption(arguments, "--dumpRefit", refit); getAndDelOption(arguments, "--dumpOutput", output); + getAndDelOption(arguments, "--dumpRawBindingsToFile", dumpRawBindings); getAndDelOption(arguments, "--dumpProfile", profile); getAndDelOption(arguments, "--dumpLayerInfo", layerInfo); + getAndDelOption(arguments, "--dumpOptimizationProfile", optProfileInfo); getAndDelOption(arguments, "--exportTimes", exportTimes); getAndDelOption(arguments, "--exportOutput", exportOutput); getAndDelOption(arguments, "--exportProfile", exportProfile); getAndDelOption(arguments, "--exportLayerInfo", exportLayerInfo); - if (percentile < 0 || percentile > 100) + + std::string percentileString; + getAndDelOption(arguments, "--percentile", percentileString); + std::vector percentileStrings = splitToStringVec(percentileString, ','); + if (!percentileStrings.empty()) + { + percentiles.clear(); + } + for (const auto& p : percentileStrings) { - throw std::invalid_argument(std::string("Percentile ") + std::to_string(percentile) + "is not in [0,100]"); + percentiles.push_back(stringToValue(p)); + } + + for (auto percentile : percentiles) + { + if (percentile < 0.F || percentile > 100.F) + { + throw std::invalid_argument(std::string("Percentile ") + std::to_string(percentile) + "is not in [0,100]"); + } } } @@ -983,61 +1721,40 @@ void AllOptions::parse(Arguments& arguments) system.parse(arguments); inference.parse(arguments); - // Use explicitBatch when input model is ONNX or when dynamic shapes are used. - const bool isOnnx{model.baseModel.format == ModelFormat::kONNX}; - const bool hasDynamicShapes{!build.shapes.empty() || !inference.shapes.empty()}; - const bool detectedExplicitBatch = isOnnx || hasDynamicShapes; - - // Throw an error if user tries to use --batch or --maxBatch when the engine has explicit batch dim. - const bool maxBatchWasSet{build.maxBatch != maxBatchNotProvided}; - const bool batchWasSet{inference.batch != batchNotProvided}; - if (detectedExplicitBatch && (maxBatchWasSet || batchWasSet)) + if (build.useRuntime != RuntimeMode::kFULL && inference.timeRefit) { - throw std::invalid_argument( - "The --batch and --maxBatch flags should not be used when the input model is ONNX or when dynamic shapes " - "are provided. Please use --optShapes and --shapes to set input shapes instead."); + throw std::invalid_argument("--timeRefit requires --useRuntime=full."); } - // If batch and/or maxBatch is not set and the engine has implicit batch dim, set them to default values. 
- if (!detectedExplicitBatch) + if (inference.optProfileIndex < static_cast(build.optProfiles.size())) { - // If batch is not set, set it to default value. - if (!batchWasSet) - { - inference.batch = defaultBatch; - } - // If maxBatch is not set, set it to be equal to batch. - if (!maxBatchWasSet) + // Propagate shape profile between builder and inference + for (auto const& s : build.optProfiles[inference.optProfileIndex]) { - build.maxBatch = inference.batch; + if (inference.shapes.find(s.first) == inference.shapes.end()) + { + insertShapesInference( + inference.shapes, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); + } } - // MaxBatch should not be less than batch. - if (build.maxBatch < inference.batch) + for (auto const& s : inference.shapes) { - throw std::invalid_argument("Build max batch " + std::to_string(build.maxBatch) - + " is less than inference batch " + std::to_string(inference.batch)); + if (build.optProfiles[inference.optProfileIndex].find(s.first) + == build.optProfiles[inference.optProfileIndex].end()) + { + // assume min/opt/max all the same + insertShapesBuild(build.optProfiles[inference.optProfileIndex], nvinfer1::OptProfileSelector::kMIN, + s.first, s.second); + insertShapesBuild(build.optProfiles[inference.optProfileIndex], nvinfer1::OptProfileSelector::kOPT, + s.first, s.second); + insertShapesBuild(build.optProfiles[inference.optProfileIndex], nvinfer1::OptProfileSelector::kMAX, + s.first, s.second); + } } } - if (build.shapes.empty() && !inference.shapes.empty()) - { - // If --shapes are provided but --optShapes are not, assume that optShapes is the same as shapes. - for (auto& s : inference.shapes) - { - insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kMIN, s.first, s.second); - insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kOPT, s.first, s.second); - insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kMAX, s.first, s.second); - } - } - else if (!build.shapes.empty() && inference.shapes.empty()) - { - // If --optShapes are provided but --shapes are not, assume that shapes is the same as optShapes. - for (auto& s : build.shapes) - { - insertShapesInference( - inference.shapes, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); - } - } + // Set nvtxVerbosity to be the same as build-time profilingVerbosity. + inference.nvtxVerbosity = build.profilingVerbosity; reporting.parse(arguments); helps = parseHelp(arguments); @@ -1050,31 +1767,56 @@ void AllOptions::parse(Arguments& arguments) } if (build.safe && system.DLACore >= 0) { - auto checkSafeDLAFormats = [](std::vector const& fmt) { - return fmt.empty() ? false : std::all_of(fmt.begin(), fmt.end(), [](IOFormat const& pair) { + build.buildDLAStandalone = true; + } + if (build.runtimePlatform != nvinfer1::RuntimePlatform::kSAME_AS_BUILD) + { + build.skipInference = true; + } + if (build.buildDLAStandalone) + { + build.skipInference = true; + auto checkSafeDLAFormats = [](std::vector const& fmt, bool isInput) { + return fmt.empty() ? 
false : std::all_of(fmt.begin(), fmt.end(), [&](IOFormat const& pair) { bool supported{false}; - bool const isLINEAR{pair.second == 1U << static_cast(nvinfer1::TensorFormat::kLINEAR)}; - bool const isCHW4{pair.second == 1U << static_cast(nvinfer1::TensorFormat::kCHW4)}; + bool const isDLA_LINEAR{ + pair.second == 1U << static_cast(nvinfer1::TensorFormat::kDLA_LINEAR)}; + bool const isHWC4{pair.second == 1U << static_cast(nvinfer1::TensorFormat::kCHW4) + || pair.second == 1U << static_cast(nvinfer1::TensorFormat::kDLA_HWC4)}; bool const isCHW32{pair.second == 1U << static_cast(nvinfer1::TensorFormat::kCHW32)}; bool const isCHW16{pair.second == 1U << static_cast(nvinfer1::TensorFormat::kCHW16)}; - supported |= pair.first == nvinfer1::DataType::kINT8 && (isLINEAR || isCHW4 || isCHW32); - supported |= pair.first == nvinfer1::DataType::kHALF && (isLINEAR || isCHW4 || isCHW16); + supported |= pair.first == nvinfer1::DataType::kINT8 + && (isDLA_LINEAR || (isInput ? isHWC4 : false) || isCHW32); + supported |= pair.first == nvinfer1::DataType::kHALF + && (isDLA_LINEAR || (isInput ? isHWC4 : false) || isCHW16); return supported; }); }; - if (!checkSafeDLAFormats(build.inputFormats) || !checkSafeDLAFormats(build.outputFormats)) + if (!checkSafeDLAFormats(build.inputFormats, true) || !checkSafeDLAFormats(build.outputFormats, false)) { throw std::invalid_argument( - "I/O formats for safe DLA capability are restricted to fp16/int8:linear, fp16:chw16 or int8:chw32"); + "I/O formats for safe DLA capability are restricted to fp16/int8:dla_linear, fp16/int8:hwc4, " + "fp16:chw16 or " + "int8:chw32"); } - if (system.fallback) + if (build.allowGPUFallback) { - throw std::invalid_argument("GPU fallback (--allowGPUFallback) not allowed for safe DLA capability"); + throw std::invalid_argument("GPU fallback (--allowGPUFallback) not allowed for DLA standalone mode"); } } } } +void TaskInferenceOptions::parse(Arguments& arguments) +{ + getAndDelOption(arguments, "engine", engine); + getAndDelOption(arguments, "device", device); + getAndDelOption(arguments, "batch", batch); + getAndDelOption(arguments, "DLACore", DLACore); + getAndDelOption(arguments, "graph", graph); + getAndDelOption(arguments, "persistentCacheRatio", persistentCacheRatio); +} + void SafeBuilderOptions::parse(Arguments& arguments) { auto getFormats = [&arguments](std::vector& formatsVector, const char* argument) { @@ -1097,13 +1839,36 @@ void SafeBuilderOptions::parse(Arguments& arguments) getFormats(outputFormats, "--outputIOFormats"); getAndDelOption(arguments, "--int8", int8); getAndDelOption(arguments, "--calib", calibFile); - getAndDelOption(arguments, "--consistency", consistency); getAndDelOption(arguments, "--std", standard); +#if !TRT_WINML std::string pluginName; while (getAndDelOption(arguments, "--plugins", pluginName)) { + sample::gLogWarning << "--plugins flag has been deprecated, use --staticPlugins flag instead." 
<< std::endl; plugins.emplace_back(pluginName); } + while (getAndDelOption(arguments, "--staticPlugins", pluginName)) + { + plugins.emplace_back(pluginName); + } +#endif + bool noBuilderCache{false}; + getAndDelOption(arguments, "--noBuilderCache", noBuilderCache); + getAndDelOption(arguments, "--timingCacheFile", timingCacheFile); + getAndDelOption(arguments, "--avgTiming", avgTiming); + if (noBuilderCache) + { + timingCacheMode = TimingCacheMode::kDISABLE; + } + else if (!timingCacheFile.empty()) + { + timingCacheMode = TimingCacheMode::kGLOBAL; + } + else + { + timingCacheMode = TimingCacheMode::kLOCAL; + } + getAndDelOption(arguments, "--sparsity", sparsity); } std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options) @@ -1113,59 +1878,25 @@ std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options) os << "Format: "; switch (options.format) { - case ModelFormat::kCAFFE: - { - os << "Caffe"; - break; - } case ModelFormat::kONNX: { os << "ONNX"; break; } - case ModelFormat::kUFF: - { - os << "UFF"; - break; - } - case ModelFormat::kANY: - os << "*"; - break; + case ModelFormat::kANY: os << "*"; break; } os << std::endl << "Model: " << options.model << std::endl; return os; } -std::ostream& operator<<(std::ostream& os, const UffInput& input) -{ - os << "Uff Inputs Layout: " << (input.NHWC ? "NHWC" : "NCHW") << std::endl; - for (const auto& i : input.inputs) - { - os << "Input: " << i.first << "," << i.second.d[0] << "," << i.second.d[1] << "," << i.second.d[2] << std::endl; - } - - return os; -} - std::ostream& operator<<(std::ostream& os, const ModelOptions& options) { os << options.baseModel; switch (options.baseModel.format) { - case ModelFormat::kCAFFE: - { - os << "Prototxt: " << options.prototxt << std::endl; - break; - } - case ModelFormat::kUFF: - { - os << options.uffInputs; - break; - } case ModelFormat::kONNX: // Fallthrough: No options to report for ONNX or the generic case - case ModelFormat::kANY: - break; + case ModelFormat::kANY: break; } os << "Output:"; @@ -1192,6 +1923,11 @@ std::ostream& operator<<(std::ostream& os, nvinfer1::DataType dtype) os << "fp16"; break; } + case nvinfer1::DataType::kBF16: + { + os << "bf16"; + break; + } case nvinfer1::DataType::kINT8: { os << "int8"; @@ -1207,6 +1943,26 @@ std::ostream& operator<<(std::ostream& os, nvinfer1::DataType dtype) os << "bool"; break; } + case nvinfer1::DataType::kUINT8: + { + os << "uint8"; + break; + } + case nvinfer1::DataType::kFP8: + { + os << "fp8"; + break; + } + case nvinfer1::DataType::kINT64: + { + os << "int64"; + break; + } + case nvinfer1::DataType::kINT4: + { + os << "int4"; + break; + } } return os; } @@ -1240,13 +1996,11 @@ std::ostream& operator<<(std::ostream& os, IOFormat const& format) os << "hwc8"; break; } -#if (NV_TENSORRT_MAJOR > 7) case nvinfer1::TensorFormat::kHWC16: { os << "hwc16"; break; } -#endif case nvinfer1::TensorFormat::kCHW4: { os << "chw4"; @@ -1277,6 +2031,11 @@ std::ostream& operator<<(std::ostream& os, IOFormat const& format) os << "hwc"; break; } + case nvinfer1::TensorFormat::kDHWC: + { + os << "dhwc"; + break; + } case nvinfer1::TensorFormat::kDLA_LINEAR: { os << "dla_linear"; @@ -1293,6 +2052,42 @@ std::ostream& operator<<(std::ostream& os, IOFormat const& format) return os; } +std::ostream& operator<<(std::ostream& os, nvinfer1::DeviceType devType) +{ + switch (devType) + { + case nvinfer1::DeviceType::kGPU: + { + os << "GPU"; + break; + } + case nvinfer1::DeviceType::kDLA: + { + os << "DLA"; + break; + } + } + return os; +} + 
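// [Editorial note, illustrative only, not part of the upstream patch] The DeviceType printer above, together
// with the RuntimePlatform and LayerDeviceTypes printers that follow, feed the "=== Build Options ===" summary,
// e.g. a --layerDeviceTypes=conv1:DLA entry would be reported as "conv1:DLA" and a runtimePlatform of
// kWINDOWS_AMD64 as "Runtime Platform: Windows AMD64" (the layer name here is an assumed example).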
+std::ostream& operator<<(std::ostream& os, nvinfer1::RuntimePlatform platform) +{ + switch (platform) + { + case nvinfer1::RuntimePlatform::kSAME_AS_BUILD: + { + os << "Same As Build"; + break; + } + case nvinfer1::RuntimePlatform::kWINDOWS_AMD64: + { + os << "Windows AMD64"; + break; + } + } + return os; +} + std::ostream& operator<<(std::ostream& os, const ShapeRange& dims) { int32_t i = 0; @@ -1319,29 +2114,76 @@ std::ostream& operator<<(std::ostream& os, LayerPrecisions const& layerPrecision return os; } +std::ostream& operator<<(std::ostream& os, LayerDeviceTypes const& layerDeviceTypes) +{ + int32_t i = 0; + for (auto const& layerDevicePair : layerDeviceTypes) + { + os << (i++ ? ", " : "") << layerDevicePair.first << ":" << layerDevicePair.second; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, StringSet const& stringSet) +{ + int64_t i = 0; + for (auto const& s : stringSet) + { + os << (i ? "," : "") << s; + ++i; + } + return os; +} + std::ostream& operator<<(std::ostream& os, const BuildOptions& options) { + // if loadEngine is specified, BuildOptions are N/A + if (options.load) + { + os << std::endl; + return os; + } // clang-format off os << "=== Build Options ===" << std::endl << - - "Max batch: "; printBatch(os, options.maxBatch) << std::endl << "Memory Pools: "; printMemoryPools(os, options) << std::endl << - "minTiming: " << options.minTiming << std::endl << "avgTiming: " << options.avgTiming << std::endl << "Precision: "; printPrecision(os, options) << std::endl << "LayerPrecisions: " << options.layerPrecisions << std::endl << + "Layer Device Types: " << options.layerDeviceTypes << std::endl << "Calibration: " << (options.int8 && options.calibration.empty() ? "Dynamic" : options.calibration.c_str()) << std::endl << "Refit: " << boolToEnabled(options.refittable) << std::endl << + "Strip weights: " << boolToEnabled(options.stripWeights) << std::endl << + "Version Compatible: " << boolToEnabled(options.versionCompatible) << std::endl << +#if !TRT_WINML + "ONNX Plugin InstanceNorm: " << boolToEnabled(options.pluginInstanceNorm) << std::endl << +#endif + "TensorRT runtime: " << options.useRuntime << std::endl << + "Lean DLL Path: " << options.leanDLLPath << std::endl << + "Tempfile Controls: "; printTempfileControls(os, options.tempfileControls) << std::endl << + "Exclude Lean Runtime: " << boolToEnabled(options.excludeLeanRuntime) << std::endl << "Sparsity: "; printSparsity(os, options) << std::endl << "Safe mode: " << boolToEnabled(options.safe) << std::endl << + "Build DLA standalone loadable: " << boolToEnabled(options.buildDLAStandalone) << std::endl << + "Allow GPU fallback for DLA: " << boolToEnabled(options.allowGPUFallback) << std::endl << "DirectIO mode: " << boolToEnabled(options.directIO) << std::endl << "Restricted mode: " << boolToEnabled(options.restricted) << std::endl << + "Skip inference: " << boolToEnabled(options.skipInference) << std::endl << "Save engine: " << (options.save ? options.engine : "") << std::endl << "Load engine: " << (options.load ? 
options.engine : "") << std::endl << "Profiling verbosity: " << static_cast(options.profilingVerbosity) << std::endl << "Tactic sources: "; printTacticSources(os, options.enabledTactics, options.disabledTactics) << std::endl << - "timingCacheMode: "; printTimingCache(os, options) << std::endl << - "timingCacheFile: " << options.timingCacheFile << std::endl; + "timingCacheMode: "; printTimingCache(os, options.timingCacheMode) << std::endl << + "timingCacheFile: " << options.timingCacheFile << std::endl << + "Enable Compilation Cache: "<< boolToEnabled(!options.disableCompilationCache) << std::endl << + "errorOnTimingCacheMiss: " << boolToEnabled(options.errorOnTimingCacheMiss) << std::endl << + "Preview Features: "; printPreviewFlags(os, options) << std::endl << + "MaxAuxStreams: " << options.maxAuxStreams << std::endl << + "BuilderOptimizationLevel: " << options.builderOptimizationLevel << std::endl << + "MaxTactics: " << options.maxTactics << std::endl << + "Calibration Profile Index: " << options.calibProfile << std::endl << + "Weight Streaming: " << boolToEnabled(options.allowWeightStreaming) << std::endl << + "Runtime Platform: " << options.runtimePlatform << std::endl << + "Debug Tensors: " << options.debugTensors << std::endl; // clang-format on auto printIOFormats = [](std::ostream& os, const char* direction, const std::vector formats) { @@ -1351,7 +2193,7 @@ std::ostream& operator<<(std::ostream& os, const BuildOptions& options) } else { - for(const auto& f : formats) + for (const auto& f : formats) { os << direction << ": " << f << std::endl; } @@ -1360,8 +2202,11 @@ std::ostream& operator<<(std::ostream& os, const BuildOptions& options) printIOFormats(os, "Input(s)", options.inputFormats); printIOFormats(os, "Output(s)", options.outputFormats); - printShapes(os, "build", options.shapes); - printShapes(os, "calibration", options.shapesCalib); + for (size_t i = 0; i < options.optProfiles.size(); i++) + { + printShapes(os, "build", options.optProfiles[i], i); + } + printShapes(os, "calibration", options.shapesCalib, -1); return os; } @@ -1372,8 +2217,8 @@ std::ostream& operator<<(std::ostream& os, const SystemOptions& options) os << "=== System Options ===" << std::endl << "Device: " << options.device << std::endl << - "DLACore: " << (options.DLACore != -1 ? std::to_string(options.DLACore) : "") << - (options.DLACore != -1 && options.fallback ? "(With GPU fallback)" : "") << std::endl; + "DLACore: " << (options.DLACore != -1 ? 
std::to_string(options.DLACore) : "") << std::endl; +#if !TRT_WINML os << "Plugins:"; for (const auto& p : options.plugins) @@ -1382,13 +2227,32 @@ std::ostream& operator<<(std::ostream& os, const SystemOptions& options) } os << std::endl; + os << "setPluginsToSerialize:"; + + for (const auto& p : options.setPluginsToSerialize) + { + os << " " << p; + } + os << std::endl; + + os << "dynamicPlugins:"; + + for (const auto& p : options.dynamicPlugins) + { + os << " " << p; + } + os << std::endl; + + os << "ignoreParsedPluginLibs: " << options.ignoreParsedPluginLibs << std::endl; + os << std::endl; +#endif return os; // clang-format on } std::ostream& operator<<(std::ostream& os, const InferenceOptions& options) { -// clang-format off + // clang-format off os << "=== Inference Options ===" << std::endl << "Batch: "; @@ -1400,48 +2264,71 @@ std::ostream& operator<<(std::ostream& os, const InferenceOptions& options) { os << "Explicit" << std::endl; } - printShapes(os, "inference", options.shapes); - os << "Iterations: " << options.iterations << std::endl << - "Duration: " << options.duration << "s (+ " - << options.warmup << "ms warm up)" << std::endl << - "Sleep time: " << options.sleep << "ms" << std::endl << - "Idle time: " << options.idle << "ms" << std::endl << - "Streams: " << options.streams << std::endl << - "ExposeDMA: " << boolToEnabled(!options.overlap) << std::endl << - "Data transfers: " << boolToEnabled(!options.skipTransfers) << std::endl << - "Spin-wait: " << boolToEnabled(options.spin) << std::endl << - "Multithreading: " << boolToEnabled(options.threads) << std::endl << - "CUDA Graph: " << boolToEnabled(options.graph) << std::endl << - "Separate profiling: " << boolToEnabled(options.rerun) << std::endl << - "Time Deserialize: " << boolToEnabled(options.timeDeserialize) << std::endl << - "Time Refit: " << boolToEnabled(options.timeRefit) << std::endl << - "Skip inference: " << boolToEnabled(options.skip) << std::endl; - -// clang-format on + printShapes(os, "inference", options.shapes, options.optProfileIndex); + + std::string wsBudget{"Disabled"}; + if (options.weightStreamingBudget.bytes == WeightStreamingBudget::kAUTOMATIC) + { + wsBudget = "Automatic"; + } + else if (options.weightStreamingBudget.bytes != WeightStreamingBudget::kDISABLE) + { + wsBudget = std::to_string(options.weightStreamingBudget.bytes) + " bytes"; + } + else if (options.weightStreamingBudget.percent != WeightStreamingBudget::kDISABLE) + { + wsBudget = std::to_string(options.weightStreamingBudget.percent) + "%"; + } + + os << "Iterations: " << options.iterations << std::endl << + "Duration: " << options.duration << "s (+ " + << options.warmup << "ms warm up)" << std::endl << + "Sleep time: " << options.sleep << "ms" << std::endl << + "Idle time: " << options.idle << "ms" << std::endl << + "Inference Streams: " << options.infStreams << std::endl << + "ExposeDMA: " << boolToEnabled(!options.overlap) << std::endl << + "Data transfers: " << boolToEnabled(!options.skipTransfers) << std::endl << + "Spin-wait: " << boolToEnabled(options.spin) << std::endl << + "Multithreading: " << boolToEnabled(options.threads) << std::endl << + "CUDA Graph: " << boolToEnabled(options.graph) << std::endl << + "Separate profiling: " << boolToEnabled(options.rerun) << std::endl << + "Time Deserialize: " << boolToEnabled(options.timeDeserialize) << std::endl << + "Time Refit: " << boolToEnabled(options.timeRefit) << std::endl << + "NVTX verbosity: " << static_cast(options.nvtxVerbosity) << std::endl << + "Persistent Cache Ratio: " 
<< static_cast(options.persistentCacheRatio) << std::endl << + "Optimization Profile Index: "<< options.optProfileIndex << std::endl << + "Weight Streaming Budget: " << wsBudget << std::endl; + // clang-format on + os << "Inputs:" << std::endl; for (const auto& input : options.inputs) { os << input.first << "<-" << input.second << std::endl; } + os << "Debug Tensor Save Destinations:" << std::endl; + for (auto const& fileName : options.debugTensorFileNames) + { + os << fileName.first << ": " << fileName.second << std::endl; + } + return os; } std::ostream& operator<<(std::ostream& os, const ReportingOptions& options) { -// clang-format off - os << "=== Reporting Options ===" << std::endl << - - "Verbose: " << boolToEnabled(options.verbose) << std::endl << - "Averages: " << options.avgs << " inferences" << std::endl << - "Percentile: " << options.percentile << std::endl << - "Dump refittable layers:" << boolToEnabled(options.refit) << std::endl << - "Dump output: " << boolToEnabled(options.output) << std::endl << - "Profile: " << boolToEnabled(options.profile) << std::endl << - "Export timing to JSON file: " << options.exportTimes << std::endl << - "Export output to JSON file: " << options.exportOutput << std::endl << - "Export profile to JSON file: " << options.exportProfile << std::endl; -// clang-format on + // clang-format off + os << "=== Reporting Options ===" << std::endl << + "Verbose: " << boolToEnabled(options.verbose) << std::endl << + "Averages: " << options.avgs << " inferences" << std::endl << + "Percentiles: " << joinValuesToString(options.percentiles, ",") << std::endl << + "Dump refittable layers:" << boolToEnabled(options.refit) << std::endl << + "Dump output: " << boolToEnabled(options.output) << std::endl << + "Profile: " << boolToEnabled(options.profile) << std::endl << + "Export timing to JSON file: " << options.exportTimes << std::endl << + "Export output to JSON file: " << options.exportOutput << std::endl << + "Export profile to JSON file: " << options.exportProfile << std::endl; + // clang-format on return os; } @@ -1461,7 +2348,7 @@ std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options) } else { - for(const auto& f : formats) + for (const auto& f : formats) { os << direction << ": " << f << std::endl; } @@ -1476,197 +2363,288 @@ std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options) { os << " + INT8"; } + if (options.fp8) + { + os << " + FP8"; + } + if (options.int4) + { + os << " + INT4"; + } os << std::endl; os << "Calibration file: " << options.calibFile << std::endl; os << "Serialized Network: " << options.serialized << std::endl; printIOFormats(os, "Input(s)", options.inputFormats); printIOFormats(os, "Output(s)", options.outputFormats); - +#if !TRT_WINML os << "Plugins:"; for (const auto& p : options.plugins) { os << " " << p; } +#endif + os << "timingCacheMode: "; + printTimingCache(os, options.timingCacheMode) << std::endl; + os << "timingCacheFile: " << options.timingCacheFile << std::endl; os << std::endl; return os; } void BaseModelOptions::help(std::ostream& os) { -// clang-format off - os << " --uff= UFF model" << std::endl << - " --onnx= ONNX model" << std::endl << - " --model= Caffe model (default = no model, random weights used)" << std::endl; -// clang-format on -} - -void UffInput::help(std::ostream& os) -{ -// clang-format off - os << " --uffInput=,X,Y,Z Input blob name and its dimensions (X,Y,Z=C,H,W), it can be specified " - "multiple times; at least one is required for UFF models" << std::endl << 
- " --uffNHWC Set if inputs are in the NHWC layout instead of NCHW (use " << - "X,Y,Z=H,W,C order in --uffInput)" << std::endl; -// clang-format on + // clang-format off + os << " --onnx= ONNX model" << std::endl; + // clang-format on } void ModelOptions::help(std::ostream& os) { -// clang-format off + // clang-format off os << "=== Model Options ===" << std::endl; BaseModelOptions::help(os); - os << " --deploy= Caffe prototxt file" << std::endl << - " --output=[,]* Output names (it can be specified multiple times); at least one output " - "is required for UFF and Caffe" << std::endl; - UffInput::help(os); -// clang-format on + // clang-format on } void BuildOptions::help(std::ostream& os) { -// clang-format off - os << "=== Build Options ===" "\n" - " --maxBatch Set max batch size and build an implicit batch engine (default = same size as --batch)" "\n" - " This option should not be used when the input model is ONNX or when dynamic shapes are provided." "\n" - " --minShapes=spec Build with dynamic shapes using a profile with the min shapes provided" "\n" - " --optShapes=spec Build with dynamic shapes using a profile with the opt shapes provided" "\n" - " --maxShapes=spec Build with dynamic shapes using a profile with the max shapes provided" "\n" - " --minShapesCalib=spec Calibrate with dynamic shapes using a profile with the min shapes provided" "\n" - " --optShapesCalib=spec Calibrate with dynamic shapes using a profile with the opt shapes provided" "\n" - " --maxShapesCalib=spec Calibrate with dynamic shapes using a profile with the max shapes provided" "\n" - " Note: All three of min, opt and max shapes must be supplied." "\n" - " However, if only opt shapes is supplied then it will be expanded so" "\n" - " that min shapes and max shapes are set to the same values as opt shapes." "\n" - " Input names can be wrapped with escaped single quotes (ex: \\\'Input:0\\\')." "\n" - " Example input shapes spec: input0:1x3x256x256,input1:1x3x128x128" "\n" - " Each input shape is supplied as a key-value pair where key is the input name and" "\n" - " value is the dimensions (including the batch dimension) to be used for that input." "\n" - " Each key-value pair has the key and value separated using a colon (:)." "\n" - " Multiple input shapes can be provided via comma-separated key-value pairs." "\n" - " --inputIOFormats=spec Type and format of each of the input tensors (default = all inputs in fp32:chw)" "\n" - " See --outputIOFormats help for the grammar of type and format list." "\n" - " Note: If this option is specified, please set comma-separated types and formats for all" "\n" - " inputs following the same order as network inputs ID (even if only one input" "\n" - " needs specifying IO format) or set the type and format once for broadcasting." "\n" - " --outputIOFormats=spec Type and format of each of the output tensors (default = all outputs in fp32:chw)" "\n" - " Note: If this option is specified, please set comma-separated types and formats for all" "\n" - " outputs following the same order as network outputs ID (even if only one output" "\n" - " needs specifying IO format) or set the type and format once for broadcasting." "\n" - " IO Formats: spec ::= IOfmt[\",\"spec]" "\n" - " IOfmt ::= type:fmt" "\n" - " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" "\n" - " fmt ::= (\"chw\"|\"chw2\"|\"chw4\"|\"hwc8\"|\"chw16\"|\"chw32\"|\"dhwc8\")[\"+\"fmt]" "\n" - " --workspace=N Set workspace size in MiB." 
"\n" - " --memPoolSize=poolspec Specify the size constraints of the designated memory pool(s) in MiB." "\n" - " Note: Also accepts decimal sizes, e.g. 0.25MiB. Will be rounded down to the nearest integer bytes." "\n" - " Pool constraint: poolspec ::= poolfmt[\",\"poolspec]" "\n" - " poolfmt ::= pool:sizeInMiB" "\n" - " pool ::= \"workspace\"|\"dlaSRAM\"|\"dlaLocalDRAM\"|\"dlaGlobalDRAM\"" "\n" - " --profilingVerbosity=mode Specify profiling verbosity. mode ::= layer_names_only|detailed|none (default = layer_names_only)" "\n" - " --minTiming=M Set the minimum number of iterations used in kernel selection (default = " - << defaultMinTiming << ")" "\n" - " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " - << defaultAvgTiming << ")" "\n" - " --refit Mark the engine as refittable. This will allow the inspection of refittable layers " "\n" - " and weights within the engine." "\n" - " --sparsity=spec Control sparsity (default = disabled). " "\n" - " Sparsity: spec ::= \"disable\", \"enable\", \"force\"" "\n" - " Note: Description about each of these options is as below" "\n" - " disable = do not enable sparse tactics in the builder (this is the default)" "\n" - " enable = enable sparse tactics in the builder (but these tactics will only be" "\n" - " considered if the weights have the right sparsity pattern)" "\n" - " force = enable sparse tactics in the builder and force-overwrite the weights to have" "\n" - " a sparsity pattern (even if you loaded a model yourself)" "\n" - " --noTF32 Disable tf32 precision (default is to enable tf32, in addition to fp32)" "\n" - " --fp16 Enable fp16 precision, in addition to fp32 (default = disabled)" "\n" - " --int8 Enable int8 precision, in addition to fp32 (default = disabled)" "\n" - " --best Enable all precisions to achieve the best performance (default = disabled)" "\n" - " --directIO Avoid reformatting at network boundaries. (default = disabled)" "\n" - " --precisionConstraints=spec Control precision constraint setting. (default = none)" "\n" - " Precision Constaints: spec ::= \"none\" | \"obey\" | \"prefer\"" "\n" - " none = no constraints" "\n" - " prefer = meet precision constraints set by --layerPrecisions/--layerOutputTypes if possible" "\n" - " obey = meet precision constraints set by --layerPrecisions/--layerOutputTypes or fail" "\n" - " otherwise" "\n" - " --layerPrecisions=spec Control per-layer precision constraints. Effective only when precisionConstraints is set to" "\n" - " \"obey\" or \"prefer\". (default = none)" "\n" - " The specs are read left-to-right, and later ones override earlier ones. \"*\" can be used as a" "\n" - " layerName to specify the default precision for all the unspecified layers." "\n" - " Per-layer precision spec ::= layerPrecision[\",\"spec]" "\n" - " layerPrecision ::= layerName\":\"precision" "\n" - " precision ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" "\n" - " --layerOutputTypes=spec Control per-layer output type constraints. Effective only when precisionConstraints is set to" "\n" - " \"obey\" or \"prefer\". (default = none)" "\n" - " The specs are read left-to-right, and later ones override earlier ones. \"*\" can be used as a" "\n" - " layerName to specify the default precision for all the unspecified layers. If a layer has more than""\n" - " one output, then multiple types separated by \"+\" can be provided for this layer." 
"\n" - " Per-layer output type spec ::= layerOutputTypes[\",\"spec]" "\n" - " layerOutputTypes ::= layerName\":\"type" "\n" - " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"[\"+\"type]" "\n" - " --calib= Read INT8 calibration cache file" "\n" - " --safe Enable build safety certified engine" "\n" - " --consistency Perform consistency checking on safety certified engine" "\n" - " --restricted Enable safety scope checking with kSAFETY_SCOPE build flag" "\n" - " --saveEngine= Save the serialized engine" "\n" - " --loadEngine= Load a serialized engine" "\n" - " --tacticSources=tactics Specify the tactics to be used by adding (+) or removing (-) tactics from the default " "\n" - " tactic sources (default = all available tactics)." "\n" - " Note: Currently only cuDNN, cuBLAS and cuBLAS-LT are listed as optional tactics." "\n" - " Tactic Sources: tactics ::= [\",\"tactic]" "\n" - " tactic ::= (+|-)lib" "\n" - " lib ::= \"CUBLAS\"|\"CUBLAS_LT\"|\"CUDNN\"" "\n" - " For example, to disable cudnn and enable cublas: --tacticSources=-CUDNN,+CUBLAS" "\n" - " --noBuilderCache Disable timing cache in builder (default is to enable timing cache)" "\n" - " --timingCacheFile= Save/load the serialized global timing cache" "\n" + // clang-format off + os << "=== Build Options ===" "\n" + " --minShapes=spec Build with dynamic shapes using a profile with the min shapes provided" "\n" + " --optShapes=spec Build with dynamic shapes using a profile with the opt shapes provided" "\n" + " --maxShapes=spec Build with dynamic shapes using a profile with the max shapes provided" "\n" + " --minShapesCalib=spec Calibrate with dynamic shapes using a profile with the min shapes provided" "\n" + " --optShapesCalib=spec Calibrate with dynamic shapes using a profile with the opt shapes provided" "\n" + " --maxShapesCalib=spec Calibrate with dynamic shapes using a profile with the max shapes provided" "\n" + " Note: All three of min, opt and max shapes must be supplied." "\n" + " However, if only opt shapes is supplied then it will be expanded so" "\n" + " that min shapes and max shapes are set to the same values as opt shapes." "\n" + " Input names can be wrapped with escaped single quotes (ex: 'Input:0')." "\n" + " Example input shapes spec: input0:1x3x256x256,input1:1x3x128x128" "\n" + " For scalars (0-D shapes), use input0:scalar or simply input0: with nothing after the colon." "\n" + " Each input shape is supplied as a key-value pair where key is the input name and" "\n" + " value is the dimensions (including the batch dimension) to be used for that input." "\n" + " Each key-value pair has the key and value separated using a colon (:)." "\n" + " Multiple input shapes can be provided via comma-separated key-value pairs, and each input name can" "\n" + " contain at most one wildcard ('*') character." "\n" + " --inputIOFormats=spec Type and format of each of the input tensors (default = all inputs in fp32:chw)" "\n" + " See --outputIOFormats help for the grammar of type and format list." "\n" + " Note: If this option is specified, please set comma-separated types and formats for all" "\n" + " inputs following the same order as network inputs ID (even if only one input" "\n" + " needs specifying IO format) or set the type and format once for broadcasting." 
"\n" + " --outputIOFormats=spec Type and format of each of the output tensors (default = all outputs in fp32:chw)" "\n" + " Note: If this option is specified, please set comma-separated types and formats for all" "\n" + " outputs following the same order as network outputs ID (even if only one output" "\n" + " needs specifying IO format) or set the type and format once for broadcasting." "\n" + R"( IO Formats: spec ::= IOfmt[","spec])" "\n" + " IOfmt ::= type:fmt" "\n" + R"( type ::= "fp32"|"fp16"|"bf16"|"int32"|"int64"|"int8"|"uint8"|"bool")" "\n" + R"( fmt ::= ("chw"|"chw2"|"chw4"|"hwc8"|"chw16"|"chw32"|"dhwc8"|)" "\n" + R"( "cdhw32"|"hwc"|"dla_linear"|"dla_hwc4")["+"fmt])" "\n" + " --memPoolSize=poolspec Specify the size constraints of the designated memory pool(s)" "\n" + " Supports the following base-2 suffixes: " << getAvailableUnitSuffixes() << "." "\n" + " If none of suffixes is appended, the defualt unit is in MiB." "\n" + " Note: Also accepts decimal sizes, e.g. 0.25M. Will be rounded down to the nearest integer bytes." "\n" + " In particular, for dlaSRAM the bytes will be rounded down to the nearest power of 2." "\n" + R"( Pool constraint: poolspec ::= poolfmt[","poolspec])" "\n" + " poolfmt ::= pool:size" "\n" + R"( pool ::= "workspace"|"dlaSRAM"|"dlaLocalDRAM"|"dlaGlobalDRAM"|"tacticSharedMem")" "\n" + " --profilingVerbosity=mode Specify profiling verbosity. mode ::= layer_names_only|detailed|none (default = layer_names_only)." "\n" + " Please only assign once." "\n" + " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " + << defaultAvgTiming << ")" "\n" + " --refit Mark the engine as refittable. This will allow the inspection of refittable layers " "\n" + " and weights within the engine." "\n" + " --stripWeights Strip weights from plan. This flag works with either refit or refit with identical weights. Default""\n" + " to latter, but you can switch to the former by enabling both --stripWeights and --refit at the same""\n" + " time." "\n" + " --stripAllWeights Alias for combining the --refit and --stripWeights options. It marks all weights as refittable," "\n" + " disregarding any performance impact. Additionally, it strips all refittable weights after the " "\n" + " engine is built." "\n" + " --weightless [Deprecated] this knob has been deprecated. Please use --stripWeights" "\n" + " --versionCompatible, --vc Mark the engine as version compatible. This allows the engine to be used with newer versions" "\n" + " of TensorRT on the same host OS, as well as TensorRT's dispatch and lean runtimes." "\n" +#if !TRT_WINML + " --pluginInstanceNorm, --pi Set `kNATIVE_INSTANCENORM` to false in the ONNX parser. This will cause the ONNX parser to use" "\n" + " a plugin InstanceNorm implementation over the native implementation when parsing." "\n" +#endif + R"( --useRuntime=runtime TensorRT runtime to execute engine. "lean" and "dispatch" require loading VC engine and do)" "\n" + " not support building an engine." "\n" + R"( runtime::= "full"|"lean"|"dispatch")" "\n" + " --leanDLLPath= External lean runtime DLL to use in version compatiable mode." "\n" + " --excludeLeanRuntime When --versionCompatible is enabled, this flag indicates that the generated engine should" "\n" + " not include an embedded lean runtime. If this is set, the user must explicitly specify a" "\n" + " valid lean runtime to use when loading the engine." "\n" + " --sparsity=spec Control sparsity (default = disabled). 
" "\n" + R"( Sparsity: spec ::= "disable", "enable", "force")" "\n" + " Note: Description about each of these options is as below" "\n" + " disable = do not enable sparse tactics in the builder (this is the default)" "\n" + " enable = enable sparse tactics in the builder (but these tactics will only be" "\n" + " considered if the weights have the right sparsity pattern)" "\n" + " force = enable sparse tactics in the builder and force-overwrite the weights to have" "\n" + " a sparsity pattern (even if you loaded a model yourself)" "\n" + " [Deprecated] this knob has been deprecated." "\n" + " Please use to rewrite the weights." "\n" + " --noTF32 Disable tf32 precision (default is to enable tf32, in addition to fp32)" "\n" + " --fp16 Enable fp16 precision, in addition to fp32 (default = disabled)" "\n" + " --bf16 Enable bf16 precision, in addition to fp32 (default = disabled)" "\n" + " --int8 Enable int8 precision, in addition to fp32 (default = disabled)" "\n" + " --fp8 Enable fp8 precision, in addition to fp32 (default = disabled)" "\n" + " --int4 Enable int4 precision, in addition to fp32 (default = disabled)" "\n" + " --best Enable all precisions to achieve the best performance (default = disabled)" "\n" + " --stronglyTyped Create a strongly typed network. (default = disabled)" "\n" + " --directIO Avoid reformatting at network boundaries. (default = disabled)" "\n" + " --precisionConstraints=spec Control precision constraint setting. (default = none)" "\n" + R"( Precision Constraints: spec ::= "none" | "obey" | "prefer")" "\n" + " none = no constraints" "\n" + " prefer = meet precision constraints set by --layerPrecisions/--layerOutputTypes if possible" "\n" + " obey = meet precision constraints set by --layerPrecisions/--layerOutputTypes or fail" "\n" + " otherwise" "\n" + " --layerPrecisions=spec Control per-layer precision constraints. Effective only when precisionConstraints is set to" "\n" + R"( "obey" or "prefer". (default = none))" "\n" + R"( The specs are read left-to-right, and later ones override earlier ones. Each layer name can)" "\n" + " contain at most one wildcard ('*') character." "\n" + R"( Per-layer precision spec ::= layerPrecision[","spec])" "\n" + R"( layerPrecision ::= layerName":"precision)" "\n" + R"( precision ::= "fp32"|"fp16"|"bf16"|"int32"|"int8")" "\n" + " --layerOutputTypes=spec Control per-layer output type constraints. Effective only when precisionConstraints is set to" "\n" + R"( "obey" or "prefer". (default = none)" "\n" + R"( The specs are read left-to-right, and later ones override earlier ones. Each layer name can)" "\n" + " contain at most one wildcard ('*') character. If a layer has more than" "\n" + R"( one output, then multiple types separated by "+" can be provided for this layer.)" "\n" + R"( Per-layer output type spec ::= layerOutputTypes[","spec])" "\n" + R"( layerOutputTypes ::= layerName":"type)" "\n" + R"( type ::= "fp32"|"fp16"|"bf16"|"int32"|"int8"["+"type])" "\n" + " --layerDeviceTypes=spec Specify layer-specific device type." "\n" + " The specs are read left-to-right, and later ones override earlier ones. If a layer does not have" "\n" + " a device type specified, the layer will opt for the default device type." 
"\n" + R"( Per-layer device type spec ::= layerDeviceTypePair[","spec])" "\n" + R"( layerDeviceTypePair ::= layerName":"deviceType)" "\n" + R"( deviceType ::= "GPU"|"DLA")" "\n" + " --calib= Read INT8 calibration cache file" "\n" + " --safe Enable build safety certified engine, if DLA is enable, --buildDLAStandalone will be specified" "\n" + " automatically (default = disabled)" "\n" + " --buildDLAStandalone Enable build DLA standalone loadable which can be loaded by cuDLA, when this option is enabled, " "\n" + " --allowGPUFallback is disallowed and --skipInference is enabled by default. Additionally, " "\n" + " specifying --inputIOFormats and --outputIOFormats restricts I/O data type and memory layout" "\n" + " (default = disabled)" "\n" + " --allowGPUFallback When DLA is enabled, allow GPU fallback for unsupported layers (default = disabled)" "\n" + " --restricted Enable safety scope checking with kSAFETY_SCOPE build flag" "\n" + " --saveEngine= Save the serialized engine" "\n" + " --loadEngine= Load a serialized engine" "\n" + " --getPlanVersionOnly Print TensorRT version when loaded plan was created. Works without deserialization of the plan." "\n" + " Use together with --loadEngine. Supported only for engines created with 8.6 and forward." "\n" + " --tacticSources=tactics Specify the tactics to be used by adding (+) or removing (-) tactics from the default " "\n" + " tactic sources (default = all available tactics)." "\n" + " Note: Currently only cuDNN, cuBLAS, cuBLAS-LT, and edge mask convolutions are listed as optional" "\n" + " tactics." "\n" + R"( Tactic Sources: tactics ::= [","tactic])" "\n" + " tactic ::= (+|-)lib" "\n" + R"( lib ::= "CUBLAS"|"CUBLAS_LT"|"CUDNN"|"EDGE_MASK_CONVOLUTIONS")" "\n" + R"( |"JIT_CONVOLUTIONS")" "\n" + " For example, to disable cudnn and enable cublas: --tacticSources=-CUDNN,+CUBLAS" "\n" + " --noBuilderCache Disable timing cache in builder (default is to enable timing cache)" "\n" + " --noCompilationCache Disable Compilation cache in builder, and the cache is part of timing cache (default is to enable compilation cache)" "\n" + " --errorOnTimingCacheMiss Emit error when a tactic being timed is not present in the timing cache (default = false)" "\n" + " --timingCacheFile= Save/load the serialized global timing cache" "\n" + " --preview=features Specify preview feature to be used by adding (+) or removing (-) preview features from the default" "\n" + R"( Preview Features: features ::= [","feature])" "\n" + " feature ::= (+|-)flag" "\n" + R"( flag ::= "aliasedPluginIO1003")" "\n" + R"( |"profileSharing0806")" "\n" + " --builderOptimizationLevel Set the builder optimization level. (default is 3)" "\n" + " Higher level allows TensorRT to spend more building time for more optimization options." "\n" + " Valid values include integers from 0 to the maximum optimization level, which is currently 5." "\n" + " --maxTactics Set the maximum number of tactics to time when there is a choice of tactics. (default is -1)" "\n" + " Larger number of tactics allow TensorRT to spend more building time on evaluating tactics." "\n" + " Default value -1 means TensorRT can decide the number of tactics based on its own heuristic." "\n" + " --hardwareCompatibilityLevel=mode Make the engine file compatible with other GPU architectures. 
(default = none)" "\n" + R"( Hardware Compatibility Level: mode ::= "none" | "ampere+")" "\n" + " none = no compatibility" "\n" + " ampere+ = compatible with Ampere and newer GPUs" "\n" + " --runtimePlatform=platform Set the target platform for runtime execution. (default = SameAsBuild)" "\n" + " When this option is enabled, --skipInference is enabled by default." "\n" + R"( RuntimePlatfrom: platform ::= "SameAsBuild" | "WindowsAMD64")" "\n" + " SameAsBuild = no requirement for cross-platform compatibility." "\n" + " WindowsAMD64 = set the target platform for engine execution as Windows AMD64 system" "\n" + " --tempdir= Overrides the default temporary directory TensorRT will use when creating temporary files." "\n" + " See IRuntime::setTemporaryDirectory API documentation for more information." "\n" + " --tempfileControls=controls Controls what TensorRT is allowed to use when creating temporary executable files." "\n" + " Should be a comma-separated list with entries in the format (in_memory|temporary):(allow|deny)." "\n" + " in_memory: Controls whether TensorRT is allowed to create temporary in-memory executable files." "\n" + " temporary: Controls whether TensorRT is allowed to create temporary executable files in the" "\n" + " filesystem (in the directory given by --tempdir)." "\n" + " For example, to allow in-memory files and disallow temporary files:" "\n" + " --tempfileControls=in_memory:allow,temporary:deny" "\n" + R"( If a flag is unspecified, the default behavior is "allow".)" "\n" + " --maxAuxStreams=N Set maximum number of auxiliary streams per inference stream that TRT is allowed to use to run " "\n" + " kernels in parallel if the network contains ops that can run in parallel, with the cost of more " "\n" + " memory usage. Set this to 0 for optimal memory usage. (default = using heuristics)" "\n" + " --profile Build with dynamic shapes using a profile with the min/max/opt shapes provided. Can be specified" "\n" + " multiple times to create multiple profiles with contiguous index." "\n" + " (ex: --profile=0 --minShapes= --optShapes= --maxShapes= --profile=1 ...)" "\n" + " --calibProfile Select the optimization profile to calibrate by index. (default = " + << defaultOptProfileIndex << ")" "\n" + " --allowWeightStreaming Enable a weight streaming engine. Must be specified with --stronglyTyped. TensorRT will disable" "\n" + " weight streaming at runtime unless --weightStreamingBudget is specified." "\n" + " --markDebug Specify list of names of tensors to be marked as debug tensors. 
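// [Editor's note] Illustrative sketch, not part of the diff above. --tempfileControls takes
// comma-separated "(in_memory|temporary):(allow|deny)" entries, with unspecified flags
// defaulting to allow (see getTempfileControlDefaults() in sampleOptions.h). Folding one
// well-formed entry into a flag bitmask might look like this; TempfileControlFlagSketch is
// a stand-in, not the real nvinfer1::TempfileControlFlag enum:
#include <cstdint>
#include <string>

enum class TempfileControlFlagSketch : int32_t { kALLOW_IN_MEMORY_FILES = 0, kALLOW_TEMPORARY_FILES = 1 };

void applyTempfileControlSketch(uint32_t& flags, std::string const& entry)
{
    auto const colon = entry.find(':');                    // e.g. "temporary:deny"
    std::string const what = entry.substr(0, colon);       // "in_memory" or "temporary"
    bool const allow = entry.substr(colon + 1) == "allow";
    auto const flag = (what == "in_memory") ? TempfileControlFlagSketch::kALLOW_IN_MEMORY_FILES
                                            : TempfileControlFlagSketch::kALLOW_TEMPORARY_FILES;
    uint32_t const bit = 1U << static_cast<int32_t>(flag);
    flags = allow ? (flags | bit) : (flags & ~bit);
}
// Example: --tempfileControls=in_memory:allow,temporary:deny clears only the temporary-files bit.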
Separate names with a comma" "\n" ; -// clang-format on + // clang-format on os << std::flush; } void SystemOptions::help(std::ostream& os) { -// clang-format off + // clang-format off os << "=== System Options ===" << std::endl << " --device=N Select cuda device N (default = " << defaultDevice << ")" << std::endl << " --useDLACore=N Select DLA core N for layers that support DLA (default = none)" << std::endl << - " --allowGPUFallback When DLA is enabled, allow GPU fallback for unsupported layers " - "(default = disabled)" << std::endl; - os << " --plugins Plugin library (.so) to load (can be specified multiple times)" << std::endl; -// clang-format on +#if TRT_WINML + std::endl; +#else + " --staticPlugins Plugin library (.so) to load statically (can be specified multiple times)" << std::endl << + " --dynamicPlugins Plugin library (.so) to load dynamically and may be serialized with the engine if they are included in --setPluginsToSerialize (can be specified multiple times)" << std::endl << + " --setPluginsToSerialize Plugin library (.so) to be serialized with the engine (can be specified multiple times)" << std::endl << + " --ignoreParsedPluginLibs By default, when building a version-compatible engine, plugin libraries specified by the ONNX parser " << std::endl << + " are implicitly serialized with the engine (unless --excludeLeanRuntime is specified) and loaded dynamically. " << std::endl << + " Enable this flag to ignore these plugin libraries instead." << std::endl; +#endif + // clang-format on } void InferenceOptions::help(std::ostream& os) { // clang-format off os << "=== Inference Options ===" << std::endl << - " --batch=N Set batch size for implicit batch engines (default = " << defaultBatch << ")" << std::endl << - " This option should not be used when the engine is built from an ONNX model or when dynamic" << std::endl << - " shapes are provided when the engine is built." << std::endl << " --shapes=spec Set input shapes for dynamic shapes inference inputs." << std::endl << - " Note: Input names can be wrapped with escaped single quotes (ex: \\\'Input:0\\\')." << std::endl << + R"( Note: Input names can be wrapped with escaped single quotes (ex: 'Input:0').)" << std::endl << " Example input shapes spec: input0:1x3x256x256, input1:1x3x128x128" << std::endl << + " For scalars (0-D shapes), use input0:scalar or simply input0: with nothing after the colon."<< std::endl << " Each input shape is supplied as a key-value pair where key is the input name and" << std::endl << " value is the dimensions (including the batch dimension) to be used for that input." << std::endl << " Each key-value pair has the key and value separated using a colon (:)." << std::endl << - " Multiple input shapes can be provided via comma-separated key-value pairs." << std::endl << + " Multiple input shapes can be provided via comma-separated key-value pairs, and each input " << std::endl << + " name can contain at most one wildcard ('*') character." << std::endl << " --loadInputs=spec Load input values from files (default = generate random inputs). Input names can be " "wrapped with single quotes (ex: 'Input:0')" << std::endl << - " Input values spec ::= Ival[\",\"spec]" << std::endl << - " Ival ::= name\":\"file" << std::endl << + R"( Input values spec ::= Ival[","spec])" << std::endl << + R"( Ival ::= name":"file)" << std::endl << + " Consult the README for more information on generating files for custom inputs." 
<< std::endl << " --iterations=N Run at least N inference iterations (default = " << defaultIterations << ")" << std::endl << " --warmUp=N Run for N milliseconds to warmup before measuring performance (default = " << defaultWarmUp << ")" << std::endl << " --duration=N Run performance measurements for at least N seconds wallclock time (default = " << defaultDuration << ")" << std::endl << + " If -1 is specified, inference will keep running unless stopped manually" << std::endl << " --sleepTime=N Delay inference start with a gap of N milliseconds between launch and compute " "(default = " << defaultSleep << ")" << std::endl << " --idleTime=N Sleep N milliseconds between two continuous iterations" "(default = " << defaultIdle << ")" << std::endl << - " --streams=N Instantiate N engines to use concurrently (default = " << defaultStreams << ")" << std::endl << + " --infStreams=N Instantiate N execution contexts to run inference concurrently " + "(default = " << defaultStreams << ")" << std::endl << " --exposeDMA Serialize DMA transfers to and from device (default = disabled)." << std::endl << " --noDataTransfers Disable DMA transfers to and from device (default = enabled)." << std::endl << - " --useManagedMemory Use managed memory instead of seperate host and device allocations (default = disabled)." << std::endl << + " --useManagedMemory Use managed memory instead of separate host and device allocations (default = disabled)." << std::endl << " --useSpinWait Actively synchronize on GPU events. This option may decrease synchronization time but " "increase CPU usage and power (default = disabled)" << std::endl << " --threads Enable multithreading to drive engines with independent threads" @@ -1677,42 +2655,84 @@ void InferenceOptions::help(std::ostream& os) " --timeRefit Time the amount of time it takes to refit the engine before inference." << std::endl << " --separateProfileRun Do not attach the profiler in the benchmark run; if profiling is enabled, a second " "profile run will be executed (default = disabled)" << std::endl << - " --buildOnly Skip inference perf measurement (default = disabled)" << std::endl; + " --skipInference Exit after the engine has been built and skip inference perf measurement " + "(default = disabled)" << std::endl << + " --persistentCacheRatio Set the persistentCacheLimit in ratio, 0.5 represent half of max persistent L2 size " + "(default = 0)" << std::endl << + " --useProfile Set the optimization profile for the inference context " + "(default = " << defaultOptProfileIndex << " )." << std::endl << + " --allocationStrategy=spec Specify how the internal device memory for inference is allocated." << std::endl << + R"( Strategy: spec ::= "static", "profile", "runtime")" << std::endl << + " static = Allocate device memory based on max size across all profiles." << std::endl << + " profile = Allocate device memory based on max size of the current profile." << std::endl << + " runtime = Allocate device memory based on the actual input shapes." << std::endl << + " --saveDebugTensors Specify list of names of tensors to turn on the debug state" << std::endl << + " and filename to save raw outputs to." << std::endl << + " These tensors must be specified as debug tensors during build time." << std::endl << + R"( Input values spec ::= Ival[","spec])" << std::endl << + R"( Ival ::= name":"file)" << std::endl << + " --weightStreamingBudget Set the maximum amount of GPU memory TensorRT is allowed to use for weights." 
<< std::endl << + " It can take on the following values:" << std::endl << + " -2: (default) Disable weight streaming at runtime." << std::endl << + " -1: TensorRT will automatically decide the budget." << std::endl << + " 0-100%: Percentage of streamable weights that reside on the GPU." << std::endl << + " 0% saves the most memory but will have the worst performance." << std::endl << + " Requires the % character." << std::endl << + " >=0B: The exact amount of streamable weights that reside on the GPU. Supports the " << std::endl << + " following base-2 suffixes: " << getAvailableUnitSuffixes() << "." << std::endl; // clang-format on } void ReportingOptions::help(std::ostream& os) { -// clang-format off + // clang-format off os << "=== Reporting Options ===" << std::endl << " --verbose Use verbose logging (default = false)" << std::endl << " --avgRuns=N Report performance measurements averaged over N consecutive " "iterations (default = " << defaultAvgRuns << ")" << std::endl << - " --percentile=P Report performance for the P percentage (0<=P<=100, 0 " + " --percentile=P1,P2,P3,... Report performance for the P1,P2,P3,... percentages (0<=P_i<=100, 0 " "representing max perf, and 100 representing min perf; (default" - " = " << defaultPercentile << "%)" << std::endl << + " = " << joinValuesToString(defaultPercentiles, ",") << "%)" << std::endl << " --dumpRefit Print the refittable layers and weights from a refittable " "engine" << std::endl << " --dumpOutput Print the output tensor(s) of the last inference iteration " "(default = disabled)" << std::endl << + " --dumpRawBindingsToFile Print the input/output tensor(s) of the last inference iteration to file" + "(default = disabled)" << std::endl << " --dumpProfile Print profile information per layer (default = disabled)" << std::endl << " --dumpLayerInfo Print layer information of the engine to console " "(default = disabled)" << std::endl << + " --dumpOptimizationProfile Print the optimization profile(s) information " + "(default = disabled)" << std::endl << " --exportTimes= Write the timing results in a json file (default = disabled)" << std::endl << " --exportOutput= Write the output tensors to a json file (default = disabled)" << std::endl << " --exportProfile= Write the profile information per layer in a json file " "(default = disabled)" << std::endl << " --exportLayerInfo= Write the layer information of the engine in a json file " "(default = disabled)" << std::endl; -// clang-format on + // clang-format on +} + +void TaskInferenceOptions::help(std::ostream& os) +{ + // clang-format off + os << "=== Task Inference Options ===" << std::endl << + " engine= Specify a serialized engine for this task" << std::endl << + " device=N Specify a GPU device for this task" << std::endl << + " DLACore=N Specify a DLACore for this task" << std::endl << + " batch=N Set batch size for implicit batch engines (default = " << defaultBatch << ")" << std::endl << + " This option should not be used for explicit batch engines" << std::endl << + " graph=1 Use cuda graph for this task" << std::endl << + " persistentCacheRatio=[0-1] Set the persistentCacheLimit ratio for this task (default = 0)" << std::endl; + // clang-format on } void helpHelp(std::ostream& os) { -// clang-format off + // clang-format off os << "=== Help ===" << std::endl << " --help, -h Print this message" << std::endl; -// clang-format on + // clang-format on } void AllOptions::help(std::ostream& os) @@ -1723,19 +2743,6 @@ void AllOptions::help(std::ostream& os) os << std::endl; 
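// [Editor's note] Illustrative sketch, not part of the diff above. The --weightStreamingBudget
// help above accepts -2 (disable), -1 (automatic), an absolute byte count with base-2
// suffixes, or a percentage of the streamable weights to keep resident on the GPU. Resolving
// a percentage budget against the total streamable weight size could look like:
#include <cstdint>

int64_t resolveWeightBudgetSketch(double percent, int64_t streamableWeightBytes)
{
    // percent is expected in [0, 100]; 0% streams everything (lowest GPU memory use),
    // 100% keeps all streamable weights resident on the GPU.
    return static_cast<int64_t>(streamableWeightBytes * (percent / 100.0));
}
// Example: resolveWeightBudgetSketch(50.0, 1000000000) == 500000000 bytes kept on the GPU.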
InferenceOptions::help(os); os << std::endl; -// clang-format off - os << "=== Build and Inference Batch Options ===" << std::endl << - " When using implicit batch, the max batch size of the engine, if not given, " << std::endl << - " is set to the inference batch size;" << std::endl << - " when using explicit batch, if shapes are specified only for inference, they " << std::endl << - " will be used also as min/opt/max in the build profile; if shapes are " << std::endl << - " specified only for the build, the opt shapes will be used also for inference;" << std::endl << - " if both are specified, they must be compatible; and if explicit batch is " << std::endl << - " enabled but neither is specified, the model must provide complete static" << std::endl << - " dimensions, including batch size, for all inputs" << std::endl << - " Using ONNX models automatically forces explicit batch." << std::endl << - std::endl; - // clang-format on ReportingOptions::help(os); os << std::endl; SystemOptions::help(os); @@ -1745,7 +2752,7 @@ void AllOptions::help(std::ostream& os) void SafeBuilderOptions::printHelp(std::ostream& os) { -// clang-format off + // clang-format off os << "=== Mandatory ===" << std::endl << " --onnx= ONNX model" << std::endl << " " << std::endl << @@ -1759,20 +2766,34 @@ void SafeBuilderOptions::printHelp(std::ostream& os) " Note: If this option is specified, please set comma-separated types and formats for all" << std::endl << " outputs following the same order as network outputs ID (even if only one output" << std::endl << " needs specifying IO format) or set the type and format once for broadcasting." << std::endl << - " IO Formats: spec ::= IOfmt[\",\"spec]" << std::endl << + R"( IO Formats: spec ::= IOfmt[","spec])" << std::endl << " IOfmt ::= type:fmt" << std::endl << - " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" << std::endl << - " fmt ::= (\"chw\"|\"chw2\"|\"chw4\"|\"hwc8\"|\"chw16\"|\"chw32\"|\"dhwc8\")[\"+\"fmt]" << std::endl << + R"( type ::= "fp32"|"fp16"|"int32"|"int8")" << std::endl << + R"( fmt ::= ("chw"|"chw2"|"chw4"|"hwc8"|"chw16"|"chw32"|"dhwc8"|)" << std::endl << + R"( "cdhw32"|"hwc"|"dla_linear"|"dla_hwc4")["+"fmt])" << std::endl << " --int8 Enable int8 precision, in addition to fp16 (default = disabled)" << std::endl << - " --consistency Enable consistency check for serialized engine, (default = disabled)" << std::endl << " --std Build standard serialized engine, (default = disabled)" << std::endl << " --calib= Read INT8 calibration cache file" << std::endl << " --serialized= Save the serialized network" << std::endl << - " --plugins Plugin library (.so) to load (can be specified multiple times)" << std::endl << +#if !TRT_WINML + " --staticPlugins Plugin library (.so) to load statically (can be specified multiple times)" << std::endl << +#endif " --verbose or -v Use verbose logging (default = false)" << std::endl << " --help or -h Print this message" << std::endl << - " " << std::endl; -// clang-format on + " --noBuilderCache Disable timing cache in builder (default is to enable timing cache)" << std::endl << + " --timingCacheFile= Save/load the serialized global timing cache" << std::endl << + " --sparsity=spec Control sparsity (default = disabled). 
" << std::endl << + R"( Sparsity: spec ::= "disable", "enable", "force")" << std::endl << + " Note: Description about each of these options is as below" << std::endl << + " disable = do not enable sparse tactics in the builder (this is the default)" << std::endl << + " enable = enable sparse tactics in the builder (but these tactics will only be" << std::endl << + " considered if the weights have the right sparsity pattern)" << std::endl << + " force = enable sparse tactics in the builder and force-overwrite the weights to have" << std::endl << + " a sparsity pattern" << std::endl << + " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " << std::endl << + "" << defaultAvgTiming << ")" << std::endl << + "" << std::endl; + // clang-format on } } // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleOptions.h b/src/Detector/tensorrt_yolo/common/sampleOptions.h index 8975e1ea..8ca0a655 100644 --- a/src/Detector/tensorrt_yolo/common/sampleOptions.h +++ b/src/Detector/tensorrt_yolo/common/sampleOptions.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -23,6 +24,7 @@ #include #include #include +#include #include #include @@ -32,9 +34,10 @@ namespace sample { // Build default params -constexpr int32_t maxBatchNotProvided{0}; -constexpr int32_t defaultMinTiming{1}; constexpr int32_t defaultAvgTiming{8}; +constexpr int32_t defaultMaxAuxStreams{-1}; +constexpr int32_t defaultBuilderOptimizationLevel{-1}; +constexpr int32_t defaultMaxTactics{-1}; // System default params constexpr int32_t defaultDevice{0}; @@ -44,14 +47,16 @@ constexpr int32_t defaultBatch{1}; constexpr int32_t batchNotProvided{0}; constexpr int32_t defaultStreams{1}; constexpr int32_t defaultIterations{10}; +constexpr int32_t defaultOptProfileIndex{0}; constexpr float defaultWarmUp{200.F}; constexpr float defaultDuration{3.F}; constexpr float defaultSleep{}; constexpr float defaultIdle{}; +constexpr float defaultPersistentCacheRatio{0}; // Reporting default params constexpr int32_t defaultAvgRuns{10}; -constexpr float defaultPercentile{99}; +constexpr std::array defaultPercentiles{90, 95, 99}; enum class PrecisionConstraints { @@ -63,9 +68,7 @@ enum class PrecisionConstraints enum class ModelFormat { kANY, - kCAFFE, - kONNX, - kUFF + kONNX }; enum class SparsityFlag @@ -82,7 +85,55 @@ enum class TimingCacheMode kGLOBAL }; -using Arguments = std::unordered_multimap; +enum class MemoryAllocationStrategy +{ + kSTATIC, //< Allocate device memory based on max size across all profiles. + kPROFILE, //< Allocate device memory based on max size of the current profile. + kRUNTIME, //< Allocate device memory based on the current input shapes. +}; + +//! +//! \enum RuntimeMode +//! +//! \brief Used to dictate which TensorRT runtime library to dynamically load. +//! +enum class RuntimeMode +{ + //! Maps to libnvinfer.so or nvinfer.dll + kFULL, + + //! 
Maps to libnvinfer_dispatch.so or nvinfer_dispatch.dll + kDISPATCH, + + //! Maps to libnvinfer_lean.so or nvinfer_lean.dll + kLEAN, +}; + +inline std::ostream& operator<<(std::ostream& os, RuntimeMode const mode) +{ + switch (mode) + { + case RuntimeMode::kFULL: + { + os << "full"; + break; + } + case RuntimeMode::kDISPATCH: + { + os << "dispatch"; + break; + } + case RuntimeMode::kLEAN: + { + os << "lean"; + break; + } + } + + return os; +} + +using Arguments = std::unordered_multimap>; using IOFormat = std::pair; @@ -90,135 +141,201 @@ using ShapeRange = std::array, nvinfer1::EnumMax; using LayerOutputTypes = std::unordered_map>; +using LayerDeviceTypes = std::unordered_map; -struct Options -{ - virtual void parse(Arguments& arguments) = 0; -}; +using StringSet = std::unordered_set; -struct BaseModelOptions : public Options +class WeightStreamingBudget { - ModelFormat format{ModelFormat::kANY}; - std::string model; +public: + static constexpr int64_t kDISABLE{-2}; + static constexpr int64_t kAUTOMATIC{-1}; + int64_t bytes{kDISABLE}; + double percent{static_cast(100.0)}; - void parse(Arguments& arguments) override; + bool isDisabled() + { + return bytes == kDISABLE && percent == kDISABLE; + } +}; - static void help(std::ostream& out); +class Options +{ +public: + virtual ~Options() = default; + virtual void parse(Arguments& arguments) = 0; }; -struct UffInput : public Options +class BaseModelOptions : public Options { - std::vector> inputs; - bool NHWC{false}; +public: + ModelFormat format{ModelFormat::kANY}; + std::string model; void parse(Arguments& arguments) override; static void help(std::ostream& out); }; -struct ModelOptions : public Options +class ModelOptions : public Options { +public: BaseModelOptions baseModel; std::string prototxt; std::vector outputs; - UffInput uffInputs; void parse(Arguments& arguments) override; static void help(std::ostream& out); }; -struct BuildOptions : public Options +constexpr nvinfer1::TempfileControlFlags getTempfileControlDefaults() { - int32_t maxBatch{maxBatchNotProvided}; + using F = nvinfer1::TempfileControlFlag; + return (1U << static_cast(F::kALLOW_TEMPORARY_FILES)) + | (1U << static_cast(F::kALLOW_IN_MEMORY_FILES)); +} + +class BuildOptions : public Options +{ +public: + // Unit in MB. double workspace{-1.0}; + // Unit in MB. double dlaSRAM{-1.0}; + // Unit in MB. double dlaLocalDRAM{-1.0}; + // Unit in MB. double dlaGlobalDRAM{-1.0}; - int32_t minTiming{defaultMinTiming}; + // Unit in KB. 
+ double tacticSharedMem{-1.0}; int32_t avgTiming{defaultAvgTiming}; + size_t calibProfile{defaultOptProfileIndex}; bool tf32{true}; bool fp16{false}; + bool bf16{false}; bool int8{false}; + bool fp8{false}; + bool int4{false}; + bool stronglyTyped{false}; bool directIO{false}; PrecisionConstraints precisionConstraints{PrecisionConstraints::kNONE}; LayerPrecisions layerPrecisions; LayerOutputTypes layerOutputTypes; + LayerDeviceTypes layerDeviceTypes; + StringSet debugTensors; + StringSet debugTensorStates; bool safe{false}; - bool consistency{false}; + bool buildDLAStandalone{false}; + bool allowGPUFallback{false}; bool restricted{false}; + bool skipInference{false}; bool save{false}; bool load{false}; bool refittable{false}; + bool stripWeights{false}; + bool versionCompatible{false}; + bool pluginInstanceNorm{false}; + bool excludeLeanRuntime{false}; + bool disableCompilationCache{false}; + int32_t builderOptimizationLevel{defaultBuilderOptimizationLevel}; + int32_t maxTactics{defaultMaxTactics}; SparsityFlag sparsity{SparsityFlag::kDISABLE}; -#if (NV_TENSORRT_MAJOR > 7) - nvinfer1::ProfilingVerbosity profilingVerbosity{nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY}; -#else - nvinfer1::ProfilingVerbosity profilingVerbosity{ nvinfer1::ProfilingVerbosity::kDEFAULT }; -#endif + nvinfer1::ProfilingVerbosity profilingVerbosity{nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY}; std::string engine; std::string calibration; - std::unordered_map shapes; - std::unordered_map shapesCalib; + using ShapeProfile = std::unordered_map; + std::vector optProfiles; + ShapeProfile shapesCalib; std::vector inputFormats; std::vector outputFormats; nvinfer1::TacticSources enabledTactics{0}; nvinfer1::TacticSources disabledTactics{0}; TimingCacheMode timingCacheMode{TimingCacheMode::kLOCAL}; std::string timingCacheFile{}; + bool errorOnTimingCacheMiss{false}; + // C++11 does not automatically generate hash function for enum class. + // Use int32_t to support C++11 compilers. 
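// [Editor's note] Illustrative sketch, not part of the diff above. The comment above explains
// why the preview-feature map is keyed by int32_t: C++11 does not provide std::hash for
// enum class types, so the enum value is cast to its underlying type before being used as a
// key. PreviewFeatureSketch is a stand-in, not the real nvinfer1::PreviewFeature enum:
#include <cstdint>
#include <unordered_map>

enum class PreviewFeatureSketch : int32_t { kFEATURE_A = 0, kFEATURE_B = 1 };

std::unordered_map<int32_t, bool> previewFeaturesSketch;

void setPreviewFeatureSketch(PreviewFeatureSketch feature, bool enable)
{
    // std::unordered_map<PreviewFeatureSketch, bool> fails to instantiate under strict C++11
    // because no std::hash<PreviewFeatureSketch> specialization exists; the int32_t key avoids that.
    previewFeaturesSketch[static_cast<int32_t>(feature)] = enable;
}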
+ std::unordered_map previewFeatures; + nvinfer1::HardwareCompatibilityLevel hardwareCompatibilityLevel{nvinfer1::HardwareCompatibilityLevel::kNONE}; + nvinfer1::RuntimePlatform runtimePlatform{nvinfer1::RuntimePlatform::kSAME_AS_BUILD}; + std::string tempdir{}; + nvinfer1::TempfileControlFlags tempfileControls{getTempfileControlDefaults()}; + RuntimeMode useRuntime{RuntimeMode::kFULL}; + std::string leanDLLPath{}; + int32_t maxAuxStreams{defaultMaxAuxStreams}; + bool getPlanVersionOnly{false}; + + bool allowWeightStreaming{false}; + void parse(Arguments& arguments) override; static void help(std::ostream& out); }; -struct SystemOptions : public Options +class SystemOptions : public Options { +public: int32_t device{defaultDevice}; int32_t DLACore{-1}; - bool fallback{false}; + bool ignoreParsedPluginLibs{false}; std::vector plugins; + std::vector setPluginsToSerialize; + std::vector dynamicPlugins; void parse(Arguments& arguments) override; static void help(std::ostream& out); }; -struct InferenceOptions : public Options +class InferenceOptions : public Options { +public: int32_t batch{batchNotProvided}; int32_t iterations{defaultIterations}; - int32_t streams{defaultStreams}; + int32_t infStreams{defaultStreams}; + int32_t optProfileIndex{defaultOptProfileIndex}; float warmup{defaultWarmUp}; float duration{defaultDuration}; float sleep{defaultSleep}; float idle{defaultIdle}; + float persistentCacheRatio{defaultPersistentCacheRatio}; bool overlap{true}; bool skipTransfers{false}; bool useManaged{false}; bool spin{false}; bool threads{false}; bool graph{false}; - bool skip{false}; bool rerun{false}; bool timeDeserialize{false}; bool timeRefit{false}; + bool setOptProfile{false}; std::unordered_map inputs; - std::unordered_map> shapes; + using ShapeProfile = std::unordered_map>; + ShapeProfile shapes; + nvinfer1::ProfilingVerbosity nvtxVerbosity{nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY}; + MemoryAllocationStrategy memoryAllocationStrategy{MemoryAllocationStrategy::kSTATIC}; + std::unordered_map debugTensorFileNames; + + WeightStreamingBudget weightStreamingBudget; void parse(Arguments& arguments) override; static void help(std::ostream& out); }; -struct ReportingOptions : public Options +class ReportingOptions : public Options { +public: bool verbose{false}; int32_t avgs{defaultAvgRuns}; - float percentile{defaultPercentile}; + std::vector percentiles{defaultPercentiles.begin(), defaultPercentiles.end()}; bool refit{false}; bool output{false}; + bool dumpRawBindings{false}; bool profile{false}; bool layerInfo{false}; + bool optProfileInfo{false}; std::string exportTimes; std::string exportOutput; std::string exportProfile; @@ -229,8 +346,9 @@ struct ReportingOptions : public Options static void help(std::ostream& out); }; -struct SafeBuilderOptions : public Options +class SafeBuilderOptions : public Options { +public: std::string serialized{}; std::string onnxModelFile{}; bool help{false}; @@ -238,18 +356,24 @@ struct SafeBuilderOptions : public Options std::vector inputFormats; std::vector outputFormats; bool int8{false}; + bool fp8{false}; + bool int4{false}; std::string calibFile{}; std::vector plugins; - bool consistency{false}; bool standard{false}; + TimingCacheMode timingCacheMode{TimingCacheMode::kLOCAL}; + std::string timingCacheFile{}; + SparsityFlag sparsity{SparsityFlag::kDISABLE}; + int32_t avgTiming{defaultAvgTiming}; void parse(Arguments& arguments) override; static void printHelp(std::ostream& out); }; -struct AllOptions : public Options +class AllOptions : public 
Options { +public: ModelOptions model; BuildOptions build; SystemOptions system; @@ -262,6 +386,20 @@ struct AllOptions : public Options static void help(std::ostream& out); }; +class TaskInferenceOptions : public Options +{ +public: + std::string engine; + int32_t device{defaultDevice}; + int32_t DLACore{-1}; + int32_t batch{batchNotProvided}; + bool graph{false}; + float persistentCacheRatio{defaultPersistentCacheRatio}; + void parse(Arguments& arguments) override; + static void help(std::ostream& out); +}; + + Arguments argsToArgumentsMap(int32_t argc, char* argv[]); bool parseHelp(Arguments& arguments); @@ -272,8 +410,6 @@ void helpHelp(std::ostream& out); std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options); -std::ostream& operator<<(std::ostream& os, const UffInput& input); - std::ostream& operator<<(std::ostream& os, const IOFormat& format); std::ostream& operator<<(std::ostream& os, const ShapeRange& dims); @@ -292,6 +428,10 @@ std::ostream& operator<<(std::ostream& os, const AllOptions& options); std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options); +std::ostream& operator<<(std::ostream& os, nvinfer1::DataType dtype); + +std::ostream& operator<<(std::ostream& os, nvinfer1::DeviceType devType); + inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) { for (int32_t i = 0; i < dims.nbDims; ++i) @@ -329,13 +469,11 @@ inline std::ostream& operator<<(std::ostream& os, const nvinfer1::WeightsRole ro os << "Constant"; break; } -#if (NV_TENSORRT_MAJOR > 7) case nvinfer1::WeightsRole::kANY: { os << "Any"; break; } -#endif } return os; diff --git a/src/Detector/tensorrt_yolo/common/sampleReporting.cpp b/src/Detector/tensorrt_yolo/common/sampleReporting.cpp index a92938c5..e9dda6e0 100644 --- a/src/Detector/tensorrt_yolo/common/sampleReporting.cpp +++ b/src/Detector/tensorrt_yolo/common/sampleReporting.cpp @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -26,6 +27,8 @@ #include "sampleOptions.h" #include "sampleReporting.h" +using namespace nvinfer1; + namespace sample { @@ -45,7 +48,7 @@ float findPercentile(float percentile, std::vector const& timings { return std::numeric_limits::infinity(); } - if (percentile < 0.0f || percentile > 100.0f) + if (percentile < 0.F || percentile > 100.F) { throw std::runtime_error("percentile is not in [0, 100]!"); } @@ -99,8 +102,26 @@ float findCoeffOfVariance(std::vector const& timings, T const& to inline InferenceTime traceToTiming(const InferenceTrace& a) { - return InferenceTime((a.enqEnd - a.enqStart), (a.h2dEnd - a.h2dStart), (a.computeEnd - a.computeStart), - (a.d2hEnd - a.d2hStart), (a.d2hEnd - a.h2dStart)); + return InferenceTime( + (a.enqEnd - a.enqStart), (a.h2dEnd - a.h2dStart), (a.computeEnd - a.computeStart), (a.d2hEnd - a.d2hStart)); +} + +inline std::string dimsToString(Dims const& shape) +{ + std::stringstream ss; + + if (shape.nbDims == 0) + { + ss << "scalar"; + } + else + { + for (int32_t i = 0; i < shape.nbDims; i++) + { + ss << shape.d[i] << (i != shape.nbDims - 1 ? "x" : ""); + } + } + return ss.str(); } } // namespace @@ -113,29 +134,40 @@ void printProlog(int32_t warmups, int32_t timings, float warmupMs, float benchTi void printTiming(std::vector const& timings, int32_t runsPerAvg, std::ostream& os) { - int32_t count = 0; + int64_t count = 0; InferenceTime sum; os << std::endl; os << "=== Trace details ===" << std::endl; os << "Trace averages of " << runsPerAvg << " runs:" << std::endl; - for (auto const& t : timings) + + // Show only the first N lines and the last N lines, where N = kTIMING_PRINT_THRESHOLD. + constexpr int64_t kTIMING_PRINT_THRESHOLD{200}; + int64_t const maxNbTimings{kTIMING_PRINT_THRESHOLD * runsPerAvg}; + + for (int64_t idx = 0, size = timings.size(); idx < size; ++idx) { - sum += t; + // Omit some latency printing to avoid very long logs. + if (size > 2 * maxNbTimings && idx == maxNbTimings) + { + os << "... Omitting " << (size - 2 * maxNbTimings) << " lines" << std::endl; + idx = size - kTIMING_PRINT_THRESHOLD * runsPerAvg - 1; + } + + sum += timings[idx]; if (++count == runsPerAvg) { // clang-format off os << "Average on " << runsPerAvg << " runs - GPU latency: " << sum.compute / runsPerAvg - << " ms - Host latency: " << sum.latency() / runsPerAvg << " ms (end to end " << sum.e2e / runsPerAvg - << " ms, enqueue " << sum.enq / runsPerAvg << " ms)" << std::endl; + << " ms - Host latency: " << sum.latency() / runsPerAvg << " ms (enqueue " << sum.enq / runsPerAvg + << " ms)" << std::endl; // clang-format on count = 0; sum.enq = 0; sum.h2d = 0; sum.compute = 0; sum.d2h = 0; - sum.e2e = 0; } } } @@ -166,14 +198,10 @@ void printMetricExplanations(std::ostream& os) os << "Latency: the summation of H2D Latency, GPU Compute Time, and D2H Latency. This is the latency to infer a " "single query." << std::endl; - os << "End-to-End Host Latency: the duration from when the H2D of a query is called to when the D2H of the same " - "query is completed, which includes the latency to wait for the completion of the previous query. This is " - "the latency of a query if multiple queries are enqueued consecutively." 
- << std::endl; } PerformanceResult getPerformanceResult(std::vector const& timings, - std::function metricGetter, float percentile) + std::function metricGetter, std::vector const& percentiles) { auto const metricComparator = [metricGetter](InferenceTime const& a, InferenceTime const& b) { return metricGetter(a) < metricGetter(b); }; @@ -183,40 +211,44 @@ PerformanceResult getPerformanceResult(std::vector const& timings PerformanceResult result; result.min = metricGetter(newTimings.front()); result.max = metricGetter(newTimings.back()); - result.mean = std::accumulate(newTimings.begin(), newTimings.end(), 0.0f, metricAccumulator) / newTimings.size(); + result.mean = std::accumulate(newTimings.begin(), newTimings.end(), 0.0F, metricAccumulator) / newTimings.size(); result.median = findMedian(newTimings, metricGetter); - result.percentile = findPercentile(percentile, newTimings, metricGetter); + for (auto percentile : percentiles) + { + result.percentiles.emplace_back(findPercentile(percentile, newTimings, metricGetter)); + } result.coeffVar = findCoeffOfVariance(newTimings, metricGetter, result.mean); return result; } -void printEpilog(std::vector const& timings, float walltimeMs, float percentile, int32_t batchSize, - std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose) +void printEpilog(std::vector const& timings, float walltimeMs, std::vector const& percentiles, + int32_t batchSize, int32_t infStreams, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose) { float const throughput = batchSize * timings.size() / walltimeMs * 1000; auto const getLatency = [](InferenceTime const& t) { return t.latency(); }; - auto const latencyResult = getPerformanceResult(timings, getLatency, percentile); - - auto const getEndToEnd = [](InferenceTime const& t) { return t.e2e; }; - auto const e2eLatencyResult = getPerformanceResult(timings, getEndToEnd, percentile); + auto const latencyResult = getPerformanceResult(timings, getLatency, percentiles); auto const getEnqueue = [](InferenceTime const& t) { return t.enq; }; - auto const enqueueResult = getPerformanceResult(timings, getEnqueue, percentile); + auto const enqueueResult = getPerformanceResult(timings, getEnqueue, percentiles); auto const getH2d = [](InferenceTime const& t) { return t.h2d; }; - auto const h2dResult = getPerformanceResult(timings, getH2d, percentile); + auto const h2dResult = getPerformanceResult(timings, getH2d, percentiles); auto const getCompute = [](InferenceTime const& t) { return t.compute; }; - auto const gpuComputeResult = getPerformanceResult(timings, getCompute, percentile); + auto const gpuComputeResult = getPerformanceResult(timings, getCompute, percentiles); auto const getD2h = [](InferenceTime const& t) { return t.d2h; }; - auto const d2hResult = getPerformanceResult(timings, getD2h, percentile); + auto const d2hResult = getPerformanceResult(timings, getD2h, percentiles); - auto const toPerfString = [percentile](const PerformanceResult& r) { + auto const toPerfString = [&](const PerformanceResult& r) { std::stringstream s; s << "min = " << r.min << " ms, max = " << r.max << " ms, mean = " << r.mean << " ms, " - << "median = " << r.median << " ms, percentile(" << percentile << "%) = " << r.percentile << " ms"; + << "median = " << r.median << " ms"; + for (int32_t i = 0, n = percentiles.size(); i < n; ++i) + { + s << ", percentile(" << percentiles[i] << "%) = " << r.percentiles[i] << " ms"; + } return s.str(); }; @@ -224,7 +256,6 @@ void printEpilog(std::vector const& timings, float 
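// [Editor's note] Illustrative sketch, not part of the diff above. getPerformanceResult()
// now resolves every entry of the percentiles vector (default 90, 95, 99) against the
// sorted timings via findPercentile(); conceptually this is a nearest-rank lookup in which
// P=0 reports the fastest time ("max perf") and P=100 the slowest ("min perf"), roughly:
#include <cstddef>
#include <limits>
#include <stdexcept>
#include <vector>

float nearestRankPercentileSketch(float percentile, std::vector<float> const& sortedAscending)
{
    if (percentile < 0.F || percentile > 100.F)
    {
        throw std::runtime_error("percentile is not in [0, 100]!");
    }
    if (sortedAscending.empty())
    {
        return std::numeric_limits<float>::infinity();
    }
    std::size_t const n = sortedAscending.size();
    std::size_t const exclude = static_cast<std::size_t>((1.0 - percentile / 100.0) * n);
    std::size_t const index = exclude >= n ? 0 : n - 1 - exclude;
    return sortedAscending[index];
}
// Example: with 1000 sorted latencies, P=99 picks index 989; P=0 picks index 0 (the fastest run).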
walltimeMs, fl osInfo << "=== Performance summary ===" << std::endl; osInfo << "Throughput: " << throughput << " qps" << std::endl; osInfo << "Latency: " << toPerfString(latencyResult) << std::endl; - osInfo << "End-to-End Host Latency: " << toPerfString(e2eLatencyResult) << std::endl; osInfo << "Enqueue Time: " << toPerfString(enqueueResult) << std::endl; osInfo << "H2D Latency: " << toPerfString(h2dResult) << std::endl; osInfo << "GPU Compute Time: " << toPerfString(gpuComputeResult) << std::endl; @@ -268,6 +299,13 @@ void printEpilog(std::vector const& timings, float walltimeMs, fl << "stability." << std::endl; } + // Report warnings if multiple inference streams are used. + if (infStreams > 1) + { + osWarning << "* Multiple inference streams are used. Latencies may not be accurate since inferences may run in " + << " parallel. Please use \"Throughput\" as the performance metric instead." << std::endl; + } + // Explain what the metrics mean. osInfo << "Explanations of the performance metrics are printed in the verbose logs." << std::endl; printMetricExplanations(osVerbose); @@ -275,27 +313,28 @@ void printEpilog(std::vector const& timings, float walltimeMs, fl osInfo << std::endl; } -void printPerformanceReport(std::vector const& trace, const ReportingOptions& reporting, float warmupMs, - int32_t batchSize, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose) +void printPerformanceReport(std::vector const& trace, ReportingOptions const& reportingOpts, + InferenceOptions const& infOpts, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose) { + int32_t batchSize = infOpts.batch; + float const warmupMs = infOpts.warmup; auto const isNotWarmup = [&warmupMs](const InferenceTrace& a) { return a.computeStart >= warmupMs; }; auto const noWarmup = std::find_if(trace.begin(), trace.end(), isNotWarmup); int32_t const warmups = noWarmup - trace.begin(); float const benchTime = trace.back().d2hEnd - noWarmup->h2dStart; - // when implicit batch used, batchSize = options.inference.batch, which is parsed through --batch - // when explicit batch used, batchSize = options.inference.batch = 0 // treat inference with explicit batch as a single query and report the throughput batchSize = batchSize ? batchSize : 1; printProlog(warmups * batchSize, (trace.size() - warmups) * batchSize, warmupMs, benchTime, osInfo); std::vector timings(trace.size() - warmups); std::transform(noWarmup, trace.end(), timings.begin(), traceToTiming); - printTiming(timings, reporting.avgs, osInfo); - printEpilog(timings, benchTime, reporting.percentile, batchSize, osInfo, osWarning, osVerbose); + printTiming(timings, reportingOpts.avgs, osInfo); + printEpilog( + timings, benchTime, reportingOpts.percentiles, batchSize, infOpts.infStreams, osInfo, osWarning, osVerbose); - if (!reporting.exportTimes.empty()) + if (!reportingOpts.exportTimes.empty()) { - exportJSONTrace(trace, reporting.exportTimes); + exportJSONTrace(trace, reportingOpts.exportTimes, warmups); } } @@ -303,15 +342,16 @@ void printPerformanceReport(std::vector const& trace, const Repo //! [ value, ...] //! value ::= { "start enq : time, "end enq" : time, "start h2d" : time, "end h2d" : time, "start compute" : time, //! "end compute" : time, "start d2h" : time, "end d2h" : time, "h2d" : time, "compute" : time, -//! "d2h" : time, "latency" : time, "end to end" : time } +//! "d2h" : time, "latency" : time } //! 
-void exportJSONTrace(std::vector const& trace, std::string const& fileName) +void exportJSONTrace(std::vector const& trace, std::string const& fileName, int32_t const nbWarmups) { std::ofstream os(fileName, std::ofstream::trunc); os << "[" << std::endl; char const* sep = " "; - for (auto const& t : trace) + for (auto iter = trace.begin() + nbWarmups; iter < trace.end(); ++iter) { + auto const& t = *iter; InferenceTime const it(traceToTiming(t)); os << sep << "{ "; sep = ", "; @@ -321,8 +361,8 @@ void exportJSONTrace(std::vector const& trace, std::string const << "\"startComputeMs\" : " << t.computeStart << sep << "\"endComputeMs\" : " << t.computeEnd << sep << "\"startD2hMs\" : " << t.d2hStart << sep << "\"endD2hMs\" : " << t.d2hEnd << sep << "\"h2dMs\" : " << it.h2d << sep << "\"computeMs\" : " << it.compute << sep - << "\"d2hMs\" : " << it.d2h << sep << "\"latencyMs\" : " << it.latency() << sep - << "\"endToEndMs\" : " << it.e2e << " }" << std::endl; + << "\"d2hMs\" : " << it.d2h << sep << "\"latencyMs\" : " << it.latency() << " }" + << std::endl; // clang-format on } os << "]" << std::endl; @@ -346,42 +386,49 @@ void Profiler::reportLayerTime(char const* layerName, float timeMs) noexcept } } - mIterator->timeMs += timeMs; + mIterator->timeMs.push_back(timeMs); ++mIterator; } void Profiler::print(std::ostream& os) const noexcept { - std::string const nameHdr("Layer"); - std::string const timeHdr(" Time (ms)"); - std::string const avgHdr(" Avg. Time (ms)"); - std::string const percentageHdr(" Time %"); + std::string const nameHdr(" Layer"); + std::string const timeHdr(" Time(ms)"); + std::string const avgHdr(" Avg.(ms)"); + std::string const medHdr(" Median(ms)"); + std::string const percentageHdr(" Time(%)"); float const totalTimeMs = getTotalTime(); - auto const cmpLayer = [](LayerProfile const& a, LayerProfile const& b) { return a.name.size() < b.name.size(); }; - auto const longestName = std::max_element(mLayers.begin(), mLayers.end(), cmpLayer); - auto const nameLength = std::max(longestName->name.size() + 1, nameHdr.size()); auto const timeLength = timeHdr.size(); auto const avgLength = avgHdr.size(); + auto const medLength = medHdr.size(); auto const percentageLength = percentageHdr.size(); os << std::endl << "=== Profile (" << mUpdatesCount << " iterations ) ===" << std::endl - << std::setw(nameLength) << nameHdr << timeHdr << avgHdr << percentageHdr << std::endl; + << timeHdr << avgHdr << medHdr << percentageHdr << nameHdr << std::endl; for (auto const& p : mLayers) { + if (p.timeMs.empty() || getTotalTime(p) == 0.F) + { + // there is no point to print profiling for layer that didn't run at all + continue; + } // clang-format off - os << std::setw(nameLength) << p.name << std::setw(timeLength) << std::fixed << std::setprecision(2) << p.timeMs - << std::setw(avgLength) << std::fixed << std::setprecision(4) << p.timeMs / mUpdatesCount - << std::setw(percentageLength) << std::fixed << std::setprecision(1) << p.timeMs / totalTimeMs * 100 - << std::endl; + os << std::setw(timeLength) << std::fixed << std::setprecision(2) << getTotalTime(p) + << std::setw(avgLength) << std::fixed << std::setprecision(4) << getAvgTime(p) + << std::setw(medLength) << std::fixed << std::setprecision(4) << getMedianTime(p) + << std::setw(percentageLength) << std::fixed << std::setprecision(1) << getTotalTime(p) / totalTimeMs * 100 + << " " << p.name << std::endl; } { - os << std::setw(nameLength) << "Total" << std::setw(timeLength) << std::fixed << std::setprecision(2) + os << std::setw(timeLength) << 
std::fixed << std::setprecision(2) << totalTimeMs << std::setw(avgLength) << std::fixed << std::setprecision(4) << totalTimeMs / mUpdatesCount - << std::setw(percentageLength) << std::fixed << std::setprecision(1) << 100.0 << std::endl; + << std::setw(medLength) << std::fixed << std::setprecision(4) << getMedianTime() + << std::setw(percentageLength) << std::fixed << std::setprecision(1) << 100.0 + << " Total" << std::endl; // clang-format on } os << std::endl; @@ -397,10 +444,11 @@ void Profiler::exportJSONProfile(std::string const& fileName) const noexcept for (auto const& l : mLayers) { // clang-format off - os << ", {" << " \"name\" : \"" << l.name << "\"" - ", \"timeMs\" : " << l.timeMs - << ", \"averageMs\" : " << l.timeMs / mUpdatesCount - << ", \"percentage\" : " << l.timeMs / totalTimeMs * 100 + os << ", {" << R"( "name" : ")" << l.name << R"(")" + R"(, "timeMs" : )" << getTotalTime(l) + << R"(, "averageMs" : )" << getAvgTime(l) + << R"(, "medianMs" : )" << getMedianTime(l) + << R"(, "percentage" : )" << getTotalTime(l) / totalTimeMs * 100 << " }" << std::endl; // clang-format on } @@ -415,8 +463,13 @@ void dumpInputs(nvinfer1::IExecutionContext const& context, Bindings const& bind void dumpOutputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os) { - os << "Output Tensors:" << std::endl; - bindings.dumpOutputs(context, os); + auto isOutput = [](Binding const& b) { return !b.isInput; }; + bindings.dumpBindings(context, isOutput, os); +} + +void dumpRawBindingsToFiles(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os) +{ + bindings.dumpRawBindingToFiles(context, os); } void exportJSONOutput( @@ -429,10 +482,10 @@ void exportJSONOutput( for (auto const& binding : output) { // clang-format off - os << sep << "{ \"name\" : \"" << binding.first << "\"" << std::endl; + os << sep << R"({ "name" : ")" << binding.first << "\"" << std::endl; sep = ", "; - os << " " << sep << "\"dimensions\" : \""; - bindings.dumpBindingDimensions(binding.second, context, os); + os << " " << sep << R"("dimensions" : ")"; + bindings.dumpBindingDimensions(binding.first, context, os); os << "\"" << std::endl; os << " " << sep << "\"values\" : [ "; bindings.dumpBindingValues(context, binding.second, os, sep, batch); @@ -442,4 +495,115 @@ void exportJSONOutput( os << "]" << std::endl; } +void exportJSONOutput( + nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::string const& fileName, int32_t batch); + +void printLayerInfo( + ReportingOptions const& reporting, nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context) +{ + if (reporting.layerInfo) + { + sample::gLogInfo << "Layer Information:" << std::endl; + sample::gLogInfo << getLayerInformation(engine, context, nvinfer1::LayerInformationFormat::kONELINE) + << std::flush; + } + if (!reporting.exportLayerInfo.empty()) + { + std::ofstream os(reporting.exportLayerInfo, std::ofstream::trunc); + os << getLayerInformation(engine, context, nvinfer1::LayerInformationFormat::kJSON) << std::flush; + } +} + +void printOptimizationProfileInfo(ReportingOptions const& reporting, nvinfer1::ICudaEngine const* engine) +{ + if (reporting.optProfileInfo) + { + sample::gLogInfo << "Optimization Profile Information:" << std::endl; + for (int32_t i = 0; i < engine->getNbOptimizationProfiles(); i++) + { + for (int32_t j = 0, e = engine->getNbIOTensors(); j < e; j++) + { + auto const tensorName = engine->getIOTensorName(j); + + if (engine->getTensorIOMode(tensorName) == 
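`exportJSONProfile` and `exportJSONOutput` switch from escaped quotes to raw string literals for the JSON keys; the emitted text is unchanged. A tiny illustration that the two spellings produce the same fragment:

```cpp
#include <iostream>

int main()
{
    float const timeMs = 1.25F;
    // Escaped form and raw-literal form print identical JSON fragments.
    std::cout << "{ \"name\" : \"conv1\", \"timeMs\" : " << timeMs << " }\n";
    std::cout << R"({ "name" : "conv1", "timeMs" : )" << timeMs << " }\n";
    return 0;
}
```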
nvinfer1::TensorIOMode::kINPUT) + { + auto tensorMinShape = engine->getProfileShape(tensorName, i, nvinfer1::OptProfileSelector::kMIN); + auto tensorOptShape = engine->getProfileShape(tensorName, i, nvinfer1::OptProfileSelector::kOPT); + auto tensorMaxShape = engine->getProfileShape(tensorName, i, nvinfer1::OptProfileSelector::kMAX); + + sample::gLogInfo << "Model input " << tensorName << " (profile " << i << "): " + << "min=" << dimsToString(tensorMinShape) + << ", opt=" << dimsToString(tensorOptShape) + << ", max=" << dimsToString(tensorMaxShape) << std::endl; + } + } + } + } +} + +void printPerformanceProfile(ReportingOptions const& reporting, InferenceEnvironment& iEnv) +{ + if (reporting.profile) + { + iEnv.profiler->print(sample::gLogInfo); + } + if (!reporting.exportProfile.empty()) + { + iEnv.profiler->exportJSONProfile(reporting.exportProfile); + } + + // Print an warning about total per-layer latency when auxiliary streams are used. + if (!iEnv.safe && (reporting.profile || !reporting.exportProfile.empty())) + { + int32_t const nbAuxStreams = iEnv.engine.get()->getNbAuxStreams(); + if (nbAuxStreams > 0) + { + sample::gLogWarning << "The engine uses " << nbAuxStreams << " auxiliary streams, so the \"Total\" latency " + << "may not be accurate because some layers may have run in parallel!" << std::endl; + } + } +} + +namespace details +{ +void dump(std::unique_ptr const& context, std::unique_ptr const& binding, + ReportingOptions const& reporting, int32_t batch) +{ + if (!context) + { + sample::gLogError << "Empty context! Skip printing outputs." << std::endl; + return; + } + if (reporting.output) + { + dumpOutputs(*context, *binding, sample::gLogInfo); + } + if (reporting.dumpRawBindings) + { + dumpRawBindingsToFiles(*context, *binding, sample::gLogInfo); + } + if (!reporting.exportOutput.empty()) + { + exportJSONOutput(*context, *binding, reporting.exportOutput, batch); + } +} +} // namespace details + +void printOutput(ReportingOptions const& reporting, InferenceEnvironment const& iEnv, int32_t batch) +{ + auto const& binding = iEnv.bindings.at(0); + if (!binding) + { + sample::gLogError << "Empty bindings! Skip printing outputs." << std::endl; + return; + } + if (iEnv.safe) + { + sample::gLogError << "Safe inferernce is not supported!" << std::endl; + return; + } + auto const& context = iEnv.contexts.at(0); + details::dump(context, binding, reporting, batch); +} + } // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleReporting.h b/src/Detector/tensorrt_yolo/common/sampleReporting.h index 5f730987..922ef3c8 100644 --- a/src/Detector/tensorrt_yolo/common/sampleReporting.h +++ b/src/Detector/tensorrt_yolo/common/sampleReporting.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -19,27 +20,26 @@ #include #include - -#include "NvInfer.h" +#include #include "sampleOptions.h" -#include "sampleUtils.h" namespace sample { +class Bindings; + //! //! \struct InferenceTime //! 
\brief Measurement times in milliseconds //! struct InferenceTime { - InferenceTime(float q, float i, float c, float o, float e) + InferenceTime(float q, float i, float c, float o) : enq(q) , h2d(i) , compute(c) , d2h(o) - , e2e(e) { } @@ -54,7 +54,6 @@ struct InferenceTime float h2d{0}; // Host to Device float compute{0}; // Compute float d2h{0}; // Device to Host - float e2e{0}; // end to end // ideal latency float latency() const @@ -102,7 +101,7 @@ struct InferenceTrace inline InferenceTime operator+(InferenceTime const& a, InferenceTime const& b) { - return InferenceTime(a.enq + b.enq, a.h2d + b.h2d, a.compute + b.compute, a.d2h + b.d2h, a.e2e + b.e2e); + return InferenceTime(a.enq + b.enq, a.h2d + b.h2d, a.compute + b.compute, a.d2h + b.d2h); } inline InferenceTime operator+=(InferenceTime& a, InferenceTime const& b) @@ -116,12 +115,12 @@ inline InferenceTime operator+=(InferenceTime& a, InferenceTime const& b) //! struct PerformanceResult { - float min{0}; - float max{0}; - float mean{0}; - float median{0}; - float percentile{0}; - float coeffVar{0}; // coefficient of variation + float min{0.F}; + float max{0.F}; + float mean{0.F}; + float median{0.F}; + std::vector percentiles; + float coeffVar{0.F}; // coefficient of variation }; //! @@ -137,14 +136,14 @@ void printTiming(std::vector const& timings, int32_t runsPerAvg, //! //! \brief Print the performance summary of a trace //! -void printEpilog(std::vector const& timings, float percentile, int32_t batchSize, std::ostream& osInfo, - std::ostream& osWarning, std::ostream& osVerbose); +void printEpilog(std::vector const& timings, std::vector const& percentiles, int32_t batchSize, + std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose); //! //! \brief Get the result of a specific performance metric from a trace //! PerformanceResult getPerformanceResult(std::vector const& timings, - std::function metricGetter, float percentile); + std::function metricGetter, std::vector const& percentiles); //! //! \brief Print the explanations of the performance metrics printed in printEpilog() function. @@ -154,13 +153,14 @@ void printMetricExplanations(std::ostream& os); //! //! \brief Print and summarize a timing trace //! -void printPerformanceReport(std::vector const& trace, ReportingOptions const& reporting, float warmupMs, - int32_t batchSize, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose); +void printPerformanceReport(std::vector const& trace, ReportingOptions const& reportingOpts, + InferenceOptions const& infOpts, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose); //! //! \brief Export a timing trace to JSON file //! -void exportJSONTrace(std::vector const& trace, std::string const& fileName); +void exportJSONTrace( + std::vector const& InferenceTime, std::string const& fileName, int32_t const nbWarmups); //! //! \brief Print input tensors to stream @@ -172,6 +172,8 @@ void dumpInputs(nvinfer1::IExecutionContext const& context, Bindings const& bind //! void dumpOutputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os); +void dumpRawBindingsToFiles(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os); + //! //! \brief Export output tensors to JSON file //! @@ -185,7 +187,7 @@ void exportJSONOutput( struct LayerProfile { std::string name; - float timeMs{0}; + std::vector timeMs; }; //! 
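With `LayerProfile::timeMs` turned into a `std::vector<float>` holding one sample per profiled iteration, the profiler can report a per-layer median alongside the total and average, which is far less sensitive to a single slow iteration. A standalone sketch of that aggregation; the struct and helper below are simplified stand-ins, not the repository's types:

```cpp
#include <algorithm>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>

// Simplified stand-in for LayerProfile after the change: one sample per iteration.
struct LayerTimes
{
    std::string name;
    std::vector<float> timeMs;
};

float median(std::vector<float> vals)
{
    if (vals.empty())
    {
        return 0.F;
    }
    std::sort(vals.begin(), vals.end());
    size_t const mid = vals.size() / 2;
    return (vals.size() % 2 == 1) ? vals[mid] : (vals[mid - 1] + vals[mid]) * 0.5F;
}

int main()
{
    LayerTimes const conv{"conv1", {0.31F, 0.29F, 0.33F, 1.10F, 0.30F}};
    float const total = std::accumulate(conv.timeMs.begin(), conv.timeMs.end(), 0.F);
    std::cout << conv.name << ": total=" << total << " ms, avg=" << total / conv.timeMs.size()
              << " ms, median=" << median(conv.timeMs) << " ms\n"; // median ignores the 1.10 ms outlier
    return 0;
}
```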
@@ -208,8 +210,58 @@ class Profiler : public nvinfer1::IProfiler private: float getTotalTime() const noexcept { - auto const plusLayerTime = [](float accumulator, LayerProfile const& lp) { return accumulator + lp.timeMs; }; - return std::accumulate(mLayers.begin(), mLayers.end(), 0.0, plusLayerTime); + auto const plusLayerTime = [](float accumulator, LayerProfile const& lp) { + return accumulator + std::accumulate(lp.timeMs.begin(), lp.timeMs.end(), 0.F, std::plus()); + }; + return std::accumulate(mLayers.begin(), mLayers.end(), 0.0F, plusLayerTime); + } + + float getMedianTime() const noexcept + { + if (mLayers.empty()) + { + return 0.F; + } + std::vector totalTime; + for (size_t run = 0; run < mLayers[0].timeMs.size(); ++run) + { + auto const layerTime + = [&run](float accumulator, LayerProfile const& lp) { return accumulator + lp.timeMs[run]; }; + auto t = std::accumulate(mLayers.begin(), mLayers.end(), 0.F, layerTime); + totalTime.push_back(t); + } + return median(totalTime); + } + + float getMedianTime(LayerProfile const& p) const noexcept + { + return median(p.timeMs); + } + + static float median(std::vector vals) + { + if (vals.empty()) + { + return 0.F; + } + std::sort(vals.begin(), vals.end()); + if (vals.size() % 2U == 1U) + { + return vals[vals.size() / 2U]; + } + return (vals[vals.size() / 2U - 1U] + vals[vals.size() / 2U]) * 0.5F; + } + + //! return the total runtime of given layer profile + float getTotalTime(LayerProfile const& p) const noexcept + { + auto const& vals = p.timeMs; + return std::accumulate(vals.begin(), vals.end(), 0.F, std::plus()); + } + + float getAvgTime(LayerProfile const& p) const noexcept + { + return getTotalTime(p) / p.timeMs.size(); } std::vector mLayers; @@ -217,6 +269,30 @@ class Profiler : public nvinfer1::IProfiler int32_t mUpdatesCount{0}; }; +//! +//! \brief Print layer info to logger or export it to output JSON file. +//! +void printLayerInfo( + ReportingOptions const& reporting, nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context); + +//! +//! \brief Print optimization profile info to logger. +//! +void printOptimizationProfileInfo(ReportingOptions const& reporting, nvinfer1::ICudaEngine const* engine); + +//! Forward declaration. +struct InferenceEnvironment; + +//! +//! \brief Print per-layer perf profile data to logger or export it to output JSON file. +//! +void printPerformanceProfile(ReportingOptions const& reporting, InferenceEnvironment& iEnv); + +//! +//! \brief Print binding output values to logger or export them to output JSON file. +//! +void printOutput(ReportingOptions const& reporting, InferenceEnvironment const& iEnv, int32_t batch); + } // namespace sample #endif // TRT_SAMPLE_REPORTING_H diff --git a/src/Detector/tensorrt_yolo/common/sampleUtils.cpp b/src/Detector/tensorrt_yolo/common/sampleUtils.cpp new file mode 100644 index 00000000..689e5857 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/sampleUtils.cpp @@ -0,0 +1,587 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sampleUtils.h" +#include "bfloat16.h" +#include "half.h" + +using namespace nvinfer1; + +namespace sample +{ + +size_t dataTypeSize(nvinfer1::DataType dataType) +{ + switch (dataType) + { + case nvinfer1::DataType::kINT64: return 8U; + case nvinfer1::DataType::kINT32: + case nvinfer1::DataType::kFLOAT: return 4U; + case nvinfer1::DataType::kBF16: + case nvinfer1::DataType::kHALF: return 2U; + case nvinfer1::DataType::kBOOL: + case nvinfer1::DataType::kUINT8: + case nvinfer1::DataType::kINT8: + case nvinfer1::DataType::kFP8: return 1U; + case nvinfer1::DataType::kINT4: + ASSERT(false && "Element size is not implemented for sub-byte data-types."); + } + return 0; +} + +int64_t volume(nvinfer1::Dims const& dims, nvinfer1::Dims const& strides, int32_t vecDim, int32_t comps, int32_t batch) +{ + int64_t maxNbElems = 1; + for (int32_t i = 0; i < dims.nbDims; ++i) + { + // Get effective length of axis. + int64_t d = dims.d[i]; + // Any dimension is 0, it is an empty tensor. + if (d == 0) + { + return 0; + } + if (i == vecDim) + { + d = samplesCommon::divUp(d, comps); + } + maxNbElems = std::max(maxNbElems, d * strides.d[i]); + } + return maxNbElems * batch * (vecDim < 0 ? 1 : comps); +} + +nvinfer1::Dims toDims(std::vector const& vec) +{ + int32_t limit = static_cast(nvinfer1::Dims::MAX_DIMS); + if (static_cast(vec.size()) > limit) + { + sample::gLogWarning << "Vector too long, only first 8 elements are used in dimension." << std::endl; + } + // Pick first nvinfer1::Dims::MAX_DIMS elements + nvinfer1::Dims dims{std::min(static_cast(vec.size()), limit), {}}; + std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d)); + return dims; +} + +void loadFromFile(std::string const& fileName, char* dst, size_t size) +{ + ASSERT(dst); + + std::ifstream file(fileName, std::ios::in | std::ios::binary); + if (file.is_open()) + { + file.seekg(0, std::ios::end); + int64_t fileSize = static_cast(file.tellg()); + // Due to change from int32_t to int64_t VC engines created with earlier versions + // may expect input of the half of the size + if (fileSize != static_cast(size) && fileSize != static_cast(size * 2)) + { + std::ostringstream msg; + msg << "Unexpected file size for input file: " << fileName << ". Note: Input binding size is: " << size + << " bytes but the file size is " << fileSize + << " bytes. Double check the size and datatype of the provided data."; + throw std::invalid_argument(msg.str()); + } + // Move file pointer back to the beginning after reading file size. + file.seekg(0, std::ios::beg); + file.read(dst, size); + size_t const nbBytesRead = file.gcount(); + file.close(); + if (nbBytesRead != size) + { + std::ostringstream msg; + msg << "Unexpected file size for input file: " << fileName << ". 
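The out-of-line `volume(dims, strides, vecDim, comps, batch)` sizes a buffer from the largest extent-times-stride product over all axes, rounding the vectorized axis up to whole vectors of `comps` components. A standalone sketch of the same computation with plain `std::vector` in place of `nvinfer1::Dims`:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Buffer size for a strided, possibly vectorized tensor: the largest (extent * stride)
// over all axes, with the vectorized axis counted in whole vectors of `comps` components.
int64_t stridedVolume(std::vector<int64_t> const& dims, std::vector<int64_t> const& strides,
    int32_t vecDim, int32_t comps, int32_t batch)
{
    int64_t maxNbElems = 1;
    for (size_t i = 0; i < dims.size(); ++i)
    {
        int64_t d = dims[i];
        if (d == 0)
        {
            return 0; // any zero extent means an empty tensor
        }
        if (static_cast<int32_t>(i) == vecDim)
        {
            d = (d + comps - 1) / comps; // number of whole vectors on the vectorized axis
        }
        maxNbElems = std::max(maxNbElems, d * strides[i]);
    }
    return maxNbElems * batch * (vecDim < 0 ? 1 : comps);
}

int main()
{
    // A 3x4x4 CHW tensor stored as C/4HW4: channels are vectorized in groups of 4.
    std::vector<int64_t> const dims{3, 4, 4};
    std::vector<int64_t> const strides{16, 4, 1};
    std::cout << stridedVolume(dims, strides, /*vecDim=*/0, /*comps=*/4, /*batch=*/1) << " elements\n"; // 64
    return 0;
}
```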
Note: Expected: " << size + << " bytes but only read: " << nbBytesRead << " bytes"; + throw std::invalid_argument(msg.str()); + } + } + else + { + std::ostringstream msg; + msg << "Cannot open file " << fileName << "!"; + throw std::invalid_argument(msg.str()); + } +} + +std::vector splitToStringVec(std::string const& s, char separator, int64_t maxSplit) +{ + std::vector splitted; + + for (size_t start = 0; start < s.length();) + { + // If maxSplit is specified and we have reached maxSplit, emplace back the rest of the string and break the + // loop. + if (maxSplit >= 0 && static_cast(splitted.size()) == maxSplit) + { + splitted.emplace_back(s.substr(start, s.length() - start)); + break; + } + + size_t separatorIndex = s.find(separator, start); + if (separatorIndex == std::string::npos) + { + separatorIndex = s.length(); + } + splitted.emplace_back(s.substr(start, separatorIndex - start)); + + // If the separator is the last character, then we should push an empty string at the end. + if (separatorIndex == s.length() - 1) + { + splitted.emplace_back(""); + } + + start = separatorIndex + 1; + } + + return splitted; +} + +bool broadcastIOFormats(std::vector const& formats, size_t nbBindings, bool isInput /*= true*/) +{ + bool broadcast = formats.size() == 1; + bool validFormatsCount = broadcast || (formats.size() == nbBindings); + if (!formats.empty() && !validFormatsCount) + { + if (isInput) + { + throw std::invalid_argument( + "The number of inputIOFormats must match network's inputs or be one for broadcasting."); + } + + throw std::invalid_argument( + "The number of outputIOFormats must match network's outputs or be one for broadcasting."); + } + return broadcast; +} + +void sparsifyMatMulKernelWeights(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights) +{ + using TensorToLayer = std::unordered_map; + using LayerToTensor = std::unordered_map; + + // 1. Collect layers and tensors information from the network. + TensorToLayer matmulI2L; + TensorToLayer constO2L; + TensorToLayer shuffleI2L; + LayerToTensor shuffleL2O; + auto collectMappingInfo = [&](int32_t const idx) + { + ILayer* l = network.getLayer(idx); + switch (l->getType()) + { + case nvinfer1::LayerType::kMATRIX_MULTIPLY: + { + // assume weights on the second input. + matmulI2L.insert({l->getInput(1), l}); + break; + } + case nvinfer1::LayerType::kCONSTANT: + { + DataType const dtype = static_cast(l)->getWeights().type; + if (dtype == nvinfer1::DataType::kFLOAT || dtype == nvinfer1::DataType::kHALF) + { + // Sparsify float only. + constO2L.insert({l->getOutput(0), l}); + } + break; + } + case nvinfer1::LayerType::kSHUFFLE: + { + shuffleI2L.insert({l->getInput(0), l}); + shuffleL2O.insert({l, l->getOutput(0)}); + break; + } + default: break; + } + }; + int32_t const nbLayers = network.getNbLayers(); + for (int32_t i = 0; i < nbLayers; ++i) + { + collectMappingInfo(i); + } + if (matmulI2L.size() == 0 || constO2L.size() == 0) + { + // No MatrixMultiply or Constant layer found, no weights to sparsify. 
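`splitToStringVec` gains a `maxSplit` parameter so callers such as the wildcard matcher can split only on the first separator and keep the remainder intact. A simplified standalone sketch of that behaviour (it omits the real helper's trailing-separator handling):

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Split on `sep`, optionally stopping after `maxSplit` pieces and keeping the rest whole.
std::vector<std::string> split(std::string const& s, char sep, int64_t maxSplit = -1)
{
    std::vector<std::string> out;
    size_t start = 0;
    while (start < s.size())
    {
        if (maxSplit >= 0 && static_cast<int64_t>(out.size()) == maxSplit)
        {
            out.push_back(s.substr(start)); // keep the remainder as one piece
            break;
        }
        size_t pos = s.find(sep, start);
        if (pos == std::string::npos)
        {
            pos = s.size();
        }
        out.push_back(s.substr(start, pos - start));
        start = pos + 1;
    }
    return out;
}

int main()
{
    for (auto const& piece : split("fc_*_bias", '*', 1))
    {
        std::cout << '[' << piece << "] "; // [fc_] [_bias]
    }
    std::cout << '\n';
    return 0;
}
```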
+ return; + } + + // Helper for analysis + auto isTranspose + = [](nvinfer1::Permutation const& perm) -> bool { return (perm.order[0] == 1 && perm.order[1] == 0); }; + auto is2D = [](nvinfer1::Dims const& dims) -> bool { return dims.nbDims == 2; }; + auto isIdenticalReshape = [](nvinfer1::Dims const& dims) -> bool + { + for (int32_t i = 0; i < dims.nbDims; ++i) + { + if (dims.d[i] != i || dims.d[i] != -1) + { + return false; + } + } + return true; + }; + auto tensorReachedViaTranspose = [&](nvinfer1::ITensor* t, bool& needTranspose) -> ITensor* + { + while (shuffleI2L.find(t) != shuffleI2L.end()) + { + nvinfer1::IShuffleLayer* s = static_cast(shuffleI2L.at(t)); + if (!is2D(s->getInput(0)->getDimensions()) || !is2D(s->getReshapeDimensions()) + || !isIdenticalReshape(s->getReshapeDimensions())) + { + break; + } + + if (isTranspose(s->getFirstTranspose())) + { + needTranspose = !needTranspose; + } + if (isTranspose(s->getSecondTranspose())) + { + needTranspose = !needTranspose; + } + + t = shuffleL2O.at(s); + } + return t; + }; + + // 2. Forward analysis to collect the Constant layers connected to MatMul via Transpose + std::unordered_map constantLayerToSparse; + for (auto& o2l : constO2L) + { + // If need to transpose the weights of the Constant layer. + // Need to transpose by default due to semantic difference. + bool needTranspose{true}; + ITensor* t = tensorReachedViaTranspose(o2l.first, needTranspose); + if (matmulI2L.find(t) == matmulI2L.end()) + { + continue; + } + + // check MatMul params... + IMatrixMultiplyLayer* mm = static_cast(matmulI2L.at(t)); + bool const twoInputs = mm->getNbInputs() == 2; + bool const all2D = is2D(mm->getInput(0)->getDimensions()) && is2D(mm->getInput(1)->getDimensions()); + bool const isSimple = mm->getOperation(0) == nvinfer1::MatrixOperation::kNONE + && mm->getOperation(1) != nvinfer1::MatrixOperation::kVECTOR; + if (!(twoInputs && all2D && isSimple)) + { + continue; + } + if (mm->getOperation(1) == nvinfer1::MatrixOperation::kTRANSPOSE) + { + needTranspose = !needTranspose; + } + + constantLayerToSparse.insert({static_cast(o2l.second), needTranspose}); + } + + // 3. Finally, sparsify the weights + auto sparsifyConstantWeights = [&sparseWeights](nvinfer1::IConstantLayer* layer, bool const needTranspose) + { + Dims dims = layer->getOutput(0)->getDimensions(); + ASSERT(dims.nbDims == 2); + int32_t const idxN = needTranspose ? 1 : 0; + int32_t const n = dims.d[idxN]; + int32_t const k = dims.d[1 - idxN]; + sparseWeights.emplace_back(); + std::vector& spw = sparseWeights.back(); + Weights w = layer->getWeights(); + DataType const dtype = w.type; + ASSERT(dtype == nvinfer1::DataType::kFLOAT + || dtype == nvinfer1::DataType::kHALF); // non-float weights should have been ignored. 
+ + if (needTranspose) + { + if (dtype == nvinfer1::DataType::kFLOAT) + { + spw.resize(w.count * sizeof(float)); + transpose2DWeights(spw.data(), w.values, k, n); + } + else if (dtype == nvinfer1::DataType::kHALF) + { + spw.resize(w.count * sizeof(half_float::half)); + transpose2DWeights(spw.data(), w.values, k, n); + } + + w.values = spw.data(); + std::vector tmpW; + sparsify(w, n, 1, tmpW); + + if (dtype == nvinfer1::DataType::kFLOAT) + { + transpose2DWeights(spw.data(), tmpW.data(), n, k); + } + else if (dtype == nvinfer1::DataType::kHALF) + { + transpose2DWeights(spw.data(), tmpW.data(), n, k); + } + } + else + { + sparsify(w, n, 1, spw); + } + + w.values = spw.data(); + layer->setWeights(w); + }; + for (auto& l : constantLayerToSparse) + { + sparsifyConstantWeights(l.first, l.second); + } +} + +template +void setSparseWeights(L& l, int32_t k, int32_t trs, std::vector& sparseWeights) +{ + auto weights = l.getKernelWeights(); + sparsify(weights, k, trs, sparseWeights); + weights.values = sparseWeights.data(); + l.setKernelWeights(weights); +} + +// Explicit instantiation +template void setSparseWeights( + IConvolutionLayer& l, int32_t k, int32_t trs, std::vector& sparseWeights); + +void sparsify(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights) +{ + for (int32_t l = 0; l < network.getNbLayers(); ++l) + { + auto* layer = network.getLayer(l); + auto const t = layer->getType(); + if (t == nvinfer1::LayerType::kCONVOLUTION) + { + auto& conv = *static_cast(layer); + auto const& dims = conv.getKernelSizeNd(); + ASSERT(dims.nbDims == 2 || dims.nbDims == 3); + auto const k = conv.getNbOutputMaps(); + auto const trs = std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies()); + sparseWeights.emplace_back(); + setSparseWeights(conv, k, trs, sparseWeights.back()); + } + } + + sparsifyMatMulKernelWeights(network, sparseWeights); + sample::gLogVerbose << "--sparsity=force pruned " << sparseWeights.size() << " weights to be sparsity pattern." << std::endl; + sample::gLogVerbose << "--sparsity=force has been deprecated. 
Please use to rewrite the weights to a sparsity pattern and then run with --sparsity=enable" << std::endl; +} + +void sparsify(Weights const& weights, int32_t k, int32_t trs, std::vector& sparseWeights) +{ + switch (weights.type) + { + case DataType::kFLOAT: + sparsify(static_cast(weights.values), weights.count, k, trs, sparseWeights); + break; + case DataType::kHALF: + sparsify(static_cast(weights.values), weights.count, k, trs, sparseWeights); + break; + case DataType::kBF16: + sparsify(static_cast(weights.values), weights.count, k, trs, sparseWeights); + break; + case DataType::kINT8: + case DataType::kINT32: + case DataType::kUINT8: + case DataType::kBOOL: + case DataType::kINT4: + case DataType::kFP8: + case DataType::kINT64: + ASSERT(false && "Unsupported data type"); + } +} + +template +void print(std::ostream& os, T v) +{ + os << v; +} + +void print(std::ostream& os, int8_t v) +{ + os << static_cast(v); +} + +void print(std::ostream& os, __half v) +{ + os << static_cast(v); +} + +template +void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv) +{ + auto const vol = volume(dims); + T const* typedBuffer = static_cast(buffer); + std::string sep; + for (int64_t v = 0; v < vol; ++v) + { + int64_t curV = v; + int32_t dataOffset = 0; + for (int32_t dimIndex = dims.nbDims - 1; dimIndex >= 0; --dimIndex) + { + int32_t dimVal = curV % dims.d[dimIndex]; + if (dimIndex == vectorDim) + { + dataOffset += (dimVal / spv) * strides.d[dimIndex] * spv + dimVal % spv; + } + else + { + dataOffset += dimVal * strides.d[dimIndex] * (vectorDim == -1 ? 1 : spv); + } + curV /= dims.d[dimIndex]; + ASSERT(curV >= 0); + } + + os << sep; + sep = separator; + print(os, typedBuffer[dataOffset]); + } +} + +// Explicit instantiation +template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); +template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); +template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); +template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); +template void dumpBuffer<__half>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); +template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); +template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); +template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); + +template +void sparsify(T const* values, int64_t count, int32_t k, int32_t trs, std::vector& sparseWeights) +{ + auto const c = count / (k * trs); + sparseWeights.resize(count * sizeof(T)); + auto* sparseValues = reinterpret_cast(sparseWeights.data()); + + constexpr int32_t window = 4; + constexpr int32_t nonzeros = 2; + + int32_t const crs = c * trs; + auto 
const getIndex = [=](int32_t ki, int32_t ci, int32_t rsi) { return ki * crs + ci * trs + rsi; }; + + for (int64_t ki = 0; ki < k; ++ki) + { + for (int64_t rsi = 0; rsi < trs; ++rsi) + { + int32_t w = 0; + int32_t nz = 0; + for (int64_t ci = 0; ci < c; ++ci) + { + auto const index = getIndex(ki, ci, rsi); + if (nz < nonzeros) + { + sparseValues[index] = values[index]; + ++nz; + } + else + { + sparseValues[index] = 0; + } + if (++w == window) + { + w = 0; + nz = 0; + } + } + } + } +} + +// Explicit instantiation +template void sparsify( + float const* values, int64_t count, int32_t k, int32_t trs, std::vector& sparseWeights); +template void sparsify( + half_float::half const* values, int64_t count, int32_t k, int32_t trs, std::vector& sparseWeights); + +template +void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n) +{ + ASSERT(dst != src); + T* tdst = reinterpret_cast(dst); + T const* tsrc = reinterpret_cast(src); + for (int32_t mi = 0; mi < m; ++mi) + { + for (int32_t ni = 0; ni < n; ++ni) + { + int32_t const isrc = mi * n + ni; + int32_t const idst = ni * m + mi; + tdst[idst] = tsrc[isrc]; + } + } +} + +// Explicit instantiation +template void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n); +template void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n); + +template ::value, bool>::type> +void fillBuffer(void* buffer, int64_t volume, T min, T max) +{ + T* typedBuffer = static_cast(buffer); + std::default_random_engine engine; + std::uniform_int_distribution distribution(min, max); + auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; + std::generate(typedBuffer, typedBuffer + volume, generator); +} + +template ::value, int32_t>::type> +void fillBuffer(void* buffer, int64_t volume, T min, T max) +{ + T* typedBuffer = static_cast(buffer); + std::default_random_engine engine; + std::uniform_real_distribution distribution(min, max); + auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; + std::generate(typedBuffer, typedBuffer + volume, generator); +} + +// Explicit instantiation +template void fillBuffer(void* buffer, int64_t volume, bool min, bool max); +template void fillBuffer(void* buffer, int64_t volume, float min, float max); +template void fillBuffer(void* buffer, int64_t volume, int32_t min, int32_t max); +template void fillBuffer(void* buffer, int64_t volume, int64_t min, int64_t max); +template void fillBuffer(void* buffer, int64_t volume, int8_t min, int8_t max); +template void fillBuffer<__half>(void* buffer, int64_t volume, __half min, __half max); +template void fillBuffer(void* buffer, int64_t volume, BFloat16 min, BFloat16 max); +template void fillBuffer(void* buffer, int64_t volume, uint8_t min, uint8_t max); + +bool matchStringWithOneWildcard(std::string const& pattern, std::string const& target) +{ + auto const splitPattern = splitToStringVec(pattern, '*', 1); + + // If there is no wildcard, return if the two strings match exactly. + if (splitPattern.size() == 1) + { + return pattern == target; + } + + // Otherwise, target must follow prefix+anything+postfix pattern. 
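The `sparsify` kernel above imposes the 2:4 structured-sparsity pattern expected by `--sparsity=force`: walking the channel dimension for each output map and spatial position, it keeps the first two elements of every window of four and zeroes the rest, regardless of magnitude. A standalone sketch of that pattern for a single slice with `k = trs = 1`:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Keep the first two elements of every window of four along the channel axis,
// overwrite the other two with zero (the real helper also walks K and RS).
std::vector<float> sparsify24(std::vector<float> const& values)
{
    std::vector<float> out(values.size(), 0.F);
    int32_t w = 0;  // position inside the current window of 4
    int32_t nz = 0; // non-zeros already kept in the current window
    for (size_t i = 0; i < values.size(); ++i)
    {
        if (nz < 2)
        {
            out[i] = values[i];
            ++nz;
        }
        if (++w == 4)
        {
            w = 0;
            nz = 0;
        }
    }
    return out;
}

int main()
{
    std::vector<float> const weights{0.5F, -1.2F, 0.7F, 0.3F, 2.0F, -0.4F, 0.1F, 0.9F};
    for (float v : sparsify24(weights))
    {
        std::cout << v << ' '; // 0.5 -1.2 0 0 2 -0.4 0 0
    }
    std::cout << '\n';
    return 0;
}
```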
+ return target.size() >= (splitPattern[0].size() + splitPattern[1].size()) && target.find(splitPattern[0]) == 0 + && target.rfind(splitPattern[1]) == (target.size() - splitPattern[1].size()); +} + +} // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/sampleUtils.h b/src/Detector/tensorrt_yolo/common/sampleUtils.h index 1509a7fc..6cd4280b 100644 --- a/src/Detector/tensorrt_yolo/common/sampleUtils.h +++ b/src/Detector/tensorrt_yolo/common/sampleUtils.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -22,6 +23,7 @@ #include #include #include +#include #include #include @@ -32,24 +34,20 @@ #include "common.h" #include "logger.h" -#include "sampleDevice.h" -#include "sampleOptions.h" + +#define SMP_RETVAL_IF_FALSE(condition, msg, retval, err) \ + { \ + if ((condition) == false) \ + { \ + (err) << (msg) << std::endl; \ + return retval; \ + } \ + } namespace sample { -inline int dataTypeSize(nvinfer1::DataType dataType) -{ - switch (dataType) - { - case nvinfer1::DataType::kINT32: - case nvinfer1::DataType::kFLOAT: return 4; - case nvinfer1::DataType::kHALF: return 2; - case nvinfer1::DataType::kBOOL: - case nvinfer1::DataType::kINT8: return 1; - } - return 0; -} +size_t dataTypeSize(nvinfer1::DataType dataType); template inline T roundUp(T m, T n) @@ -57,485 +55,71 @@ inline T roundUp(T m, T n) return ((m + n - 1) / n) * n; } -inline int volume(const nvinfer1::Dims& d) -{ - return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); -} - //! comps is the number of components in a vector. Ignored if vecDim < 0. -inline int64_t volume(const nvinfer1::Dims& dims, const nvinfer1::Dims& strides, int vecDim, int comps, int batch) -{ - int maxNbElems = 1; - for (int i = 0; i < dims.nbDims; ++i) - { - // Get effective length of axis. - int d = dims.d[i]; - // Any dimension is 0, it is an empty tensor. - if (d == 0) - { - return 0; - } - if (i == vecDim) - { - d = samplesCommon::divUp(d, comps); - } - maxNbElems = std::max(maxNbElems, d * strides.d[i]); - } - return static_cast(maxNbElems) * batch * (vecDim < 0 ? 1 : comps); -} +int64_t volume(nvinfer1::Dims const& dims, nvinfer1::Dims const& strides, int32_t vecDim, int32_t comps, int32_t batch); -inline int64_t volume(nvinfer1::Dims dims, int vecDim, int comps, int batch) -{ - if (vecDim != -1) - { - dims.d[vecDim] = roundUp(dims.d[vecDim], comps); - } - return volume(dims) * std::max(batch, 1); -} +using samplesCommon::volume; -inline nvinfer1::Dims toDims(const std::vector& vec) -{ - int limit = static_cast(nvinfer1::Dims::MAX_DIMS); - if (static_cast(vec.size()) > limit) - { - sample::gLogWarning << "Vector too long, only first 8 elements are used in dimension." 
<< std::endl; - } - // Pick first nvinfer1::Dims::MAX_DIMS elements - nvinfer1::Dims dims{std::min(static_cast(vec.size()), limit), {}}; - std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d)); - return dims; -} +nvinfer1::Dims toDims(std::vector const& vec); -template -inline void fillBuffer(void* buffer, int64_t volume, T min, T max) -{ - T* typedBuffer = static_cast(buffer); - std::default_random_engine engine; - if (std::is_integral::value) - { - std::uniform_int_distribution distribution(min, max); - auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; - std::generate(typedBuffer, typedBuffer + volume, generator); - } - else - { - std::uniform_real_distribution distribution(min, max); - auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; - std::generate(typedBuffer, typedBuffer + volume, generator); - } -} +template ::value, bool>::type = true> +void fillBuffer(void* buffer, int64_t volume, T min, T max); -// Specialization needed for custom type __half -template -inline void fillBufferHalf(void* buffer, int64_t volume, H min, H max) -{ - H* typedBuffer = static_cast(buffer); - std::default_random_engine engine; - std::uniform_real_distribution distribution(min, max); - auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; - std::generate(typedBuffer, typedBuffer + volume, generator); -} -template <> -inline void fillBuffer<__half>(void* buffer, int64_t volume, __half min, __half max) -{ - fillBufferHalf(buffer, volume, min, max); -} +template ::value, int32_t>::type = 0> +void fillBuffer(void* buffer, int64_t volume, T min, T max); template -inline void dumpBuffer(const void* buffer, const std::string& separator, std::ostream& os, const nvinfer1::Dims& dims, - const nvinfer1::Dims& strides, int32_t vectorDim, int32_t spv) -{ - const int64_t volume = std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies()); - const T* typedBuffer = static_cast(buffer); - std::string sep; - for (int64_t v = 0; v < volume; ++v) - { - int64_t curV = v; - int32_t dataOffset = 0; - for (int32_t dimIndex = dims.nbDims - 1; dimIndex >= 0; --dimIndex) - { - int32_t dimVal = curV % dims.d[dimIndex]; - if (dimIndex == vectorDim) - { - dataOffset += (dimVal / spv) * strides.d[dimIndex] * spv + dimVal % spv; - } - else - { - dataOffset += dimVal * strides.d[dimIndex] * (vectorDim == -1 ? 
1 : spv); - } - curV /= dims.d[dimIndex]; - ASSERT(curV >= 0); - } - - os << sep << typedBuffer[dataOffset]; - sep = separator; - } -} - -inline void loadFromFile(std::string const& fileName, char* dst, size_t size) -{ - ASSERT(dst); - - std::ifstream file(fileName, std::ios::in | std::ios::binary); - if (file.is_open()) - { - file.read(dst, size); - file.close(); - } - else - { - std::stringstream msg; - msg << "Cannot open file " << fileName << "!"; - throw std::invalid_argument(msg.str()); - } -} - -struct Binding -{ - bool isInput{false}; - std::unique_ptr buffer; - int64_t volume{0}; - nvinfer1::DataType dataType{nvinfer1::DataType::kFLOAT}; - - void fill(const std::string& fileName) - { - loadFromFile(fileName, static_cast(buffer->getHostBuffer()), buffer->getSize()); - } - - void fill() - { - switch (dataType) - { - case nvinfer1::DataType::kBOOL: - { - fillBuffer(buffer->getHostBuffer(), volume, 0, 1); - break; - } - case nvinfer1::DataType::kINT32: - { - fillBuffer(buffer->getHostBuffer(), volume, -128, 127); - break; - } - case nvinfer1::DataType::kINT8: - { - fillBuffer(buffer->getHostBuffer(), volume, -128, 127); - break; - } - case nvinfer1::DataType::kFLOAT: - { - fillBuffer(buffer->getHostBuffer(), volume, -1.0F, 1.0F); - break; - } - case nvinfer1::DataType::kHALF: - { - fillBuffer<__half>(buffer->getHostBuffer(), volume, -1.0F, 1.0F); - break; - } - } - } - - void dump(std::ostream& os, nvinfer1::Dims dims, nvinfer1::Dims strides, int32_t vectorDim, int32_t spv, - const std::string separator = " ") const - { - switch (dataType) - { - case nvinfer1::DataType::kBOOL: - { - dumpBuffer(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); - break; - } - case nvinfer1::DataType::kINT32: - { - dumpBuffer(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); - break; - } - case nvinfer1::DataType::kINT8: - { - dumpBuffer(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); - break; - } - case nvinfer1::DataType::kFLOAT: - { - dumpBuffer(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); - break; - } - case nvinfer1::DataType::kHALF: - { - dumpBuffer<__half>(buffer->getHostBuffer(), separator, os, dims, strides, vectorDim, spv); - break; - } - } - } -}; - -class Bindings -{ -public: - Bindings() = delete; - explicit Bindings(bool useManaged) - : mUseManaged(useManaged) - { - } - - void addBinding(int b, const std::string& name, bool isInput, int64_t volume, nvinfer1::DataType dataType, - const std::string& fileName = "") - { - while (mBindings.size() <= static_cast(b)) - { - mBindings.emplace_back(); - mDevicePointers.emplace_back(); - } - mNames[name] = b; - if (mBindings[b].buffer == nullptr) - { - if (mUseManaged) - mBindings[b].buffer.reset(new UnifiedMirroredBuffer); - else - mBindings[b].buffer.reset(new DiscreteMirroredBuffer); - } - mBindings[b].isInput = isInput; - // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr - // even for empty tensors, so allocate a dummy byte. 
- if (volume == 0) - mBindings[b].buffer->allocate(1); - else - mBindings[b].buffer->allocate(static_cast(volume) * static_cast(dataTypeSize(dataType))); - - mBindings[b].volume = volume; - mBindings[b].dataType = dataType; - mDevicePointers[b] = mBindings[b].buffer->getDeviceBuffer(); - if (isInput) - { - if (fileName.empty()) - fill(b); - else - fill(b, fileName); - } - } - - void** getDeviceBuffers() - { - return mDevicePointers.data(); - } - - void transferInputToDevice(TrtCudaStream& stream) - { - for (auto& b : mNames) - { - if (mBindings[b.second].isInput) - mBindings[b.second].buffer->hostToDevice(stream); - } - } - - void transferOutputToHost(TrtCudaStream& stream) - { - for (auto& b : mNames) - { - if (!mBindings[b.second].isInput) - mBindings[b.second].buffer->deviceToHost(stream); - } - } - - void fill(int binding, const std::string& fileName) - { - mBindings[binding].fill(fileName); - } - - void fill(int binding) - { - mBindings[binding].fill(); - } +void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, nvinfer1::Dims const& dims, + nvinfer1::Dims const& strides, int32_t vectorDim, int32_t spv); - void dumpBindingDimensions(int binding, const nvinfer1::IExecutionContext& context, std::ostream& os) const - { - const auto dims = context.getBindingDimensions(binding); - // Do not add a newline terminator, because the caller may be outputting a JSON string. - os << dims; - } - - void dumpBindingValues(const nvinfer1::IExecutionContext& context, int binding, std::ostream& os, - const std::string& separator = " ", int32_t batch = 1) const - { - nvinfer1::Dims dims = context.getBindingDimensions(binding); - nvinfer1::Dims strides = context.getStrides(binding); - int32_t vectorDim = context.getEngine().getBindingVectorizedDim(binding); - const int32_t spv = context.getEngine().getBindingComponentsPerElement(binding); +void loadFromFile(std::string const& fileName, char* dst, size_t size); - if (context.getEngine().hasImplicitBatchDimension()) - { - auto insertN = [](nvinfer1::Dims& d, int32_t bs) { - const int32_t nbDims = d.nbDims; - ASSERT(nbDims < nvinfer1::Dims::MAX_DIMS); - std::copy_backward(&d.d[0], &d.d[nbDims], &d.d[nbDims + 1]); - d.d[0] = bs; - d.nbDims = nbDims + 1; - }; - int32_t batchStride = 0; - for (int32_t i = 0; i < strides.nbDims; ++i) - { - if (strides.d[i] * dims.d[i] > batchStride) - { - batchStride = strides.d[i] * dims.d[i]; - } - } - insertN(dims, batch); - insertN(strides, batchStride); - vectorDim = (vectorDim == -1) ? 
-1 : vectorDim + 1; - } - - mBindings[binding].dump(os, dims, strides, vectorDim, spv, separator); - } - - void dumpInputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const - { - auto isInput = [](const Binding& b) { return b.isInput; }; - dumpBindings(context, isInput, os); - } +std::vector splitToStringVec(std::string const& option, char separator, int64_t maxSplit = -1); - void dumpOutputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const - { - auto isOutput = [](const Binding& b) { return !b.isInput; }; - dumpBindings(context, isOutput, os); - } +bool broadcastIOFormats(std::vector const& formats, size_t nbBindings, bool isInput = true); - void dumpBindings(const nvinfer1::IExecutionContext& context, std::ostream& os) const - { - auto all = [](const Binding& /*b*/) { return true; }; - dumpBindings(context, all, os); - } +int32_t getCudaDriverVersion(); - void dumpBindings( - const nvinfer1::IExecutionContext& context, bool (*predicate)(const Binding& b), std::ostream& os) const - { - for (const auto& n : mNames) - { - const auto binding = n.second; - if (predicate(mBindings[binding])) - { - os << n.first << ": ("; - dumpBindingDimensions(binding, context, os); - os << ")" << std::endl; +int32_t getCudaRuntimeVersion(); - dumpBindingValues(context, binding, os); - os << std::endl; - } - } - } +void sparsify(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights); +void sparsify(nvinfer1::Weights const& weights, int32_t k, int32_t rs, std::vector& sparseWeights); - std::unordered_map getInputBindings() const - { - auto isInput = [](const Binding& b) { return b.isInput; }; - return getBindings(isInput); - } - - std::unordered_map getOutputBindings() const - { - auto isOutput = [](const Binding& b) { return !b.isInput; }; - return getBindings(isOutput); - } - - std::unordered_map getBindings() const - { - auto all = [](const Binding& /*b*/) { return true; }; - return getBindings(all); - } +// Walk the weights elements and overwrite (at most) 2 out of 4 elements to 0. +template +void sparsify(T const* values, int64_t count, int32_t k, int32_t rs, std::vector& sparseWeights); - std::unordered_map getBindings(bool (*predicate)(const Binding& b)) const - { - std::unordered_map bindings; - for (const auto& n : mNames) - { - const auto binding = n.second; - if (predicate(mBindings[binding])) - bindings.insert(n); - } - return bindings; - } +template +void setSparseWeights(L& l, int32_t k, int32_t rs, std::vector& sparseWeights); -private: - std::unordered_map mNames; - std::vector mBindings; - std::vector mDevicePointers; - bool mUseManaged{false}; -}; +// Sparsify the weights of Constant layers that are fed to MatMul via Shuffle layers. +// Forward analysis on the API graph to determine which weights to sparsify. +void sparsifyMatMulKernelWeights( + nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights); template -struct TrtDestroyer -{ - void operator()(T* t) - { - //t->destroy(); - delete t; - } -}; +void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n); -template -using TrtUniquePtr = std::unique_ptr>; +//! A helper function to match a target string with a pattern where the pattern can contain up to one wildcard ('*') +//! character that matches to any strings. 
+bool matchStringWithOneWildcard(std::string const& pattern, std::string const& target); -inline bool broadcastIOFormats(const std::vector& formats, size_t nbBindings, bool isInput = true) -{ - bool broadcast = formats.size() == 1; - bool validFormatsCount = broadcast || (formats.size() == nbBindings); - if (!formats.empty() && !validFormatsCount) - { - if (isInput) - { - throw std::invalid_argument( - "The number of inputIOFormats must match network's inputs or be one for broadcasting."); - } - else - { - throw std::invalid_argument( - "The number of outputIOFormats must match network's outputs or be one for broadcasting."); - } - } - return broadcast; -} - -inline std::vector loadTimingCacheFile(const std::string inFileName) -{ - std::ifstream iFile(inFileName, std::ios::in | std::ios::binary); - if (!iFile) - { - sample::gLogWarning << "Could not read timing cache from: " << inFileName - << ". A new timing cache will be generated and written." << std::endl; - return std::vector(); - } - iFile.seekg(0, std::ifstream::end); - size_t fsize = iFile.tellg(); - iFile.seekg(0, std::ifstream::beg); - std::vector content(fsize); - iFile.read(content.data(), fsize); - iFile.close(); - sample::gLogInfo << "Loaded " << fsize << " bytes of timing cache from " << inFileName << std::endl; - return content; -} - -inline void saveTimingCacheFile(const std::string outFileName, const nvinfer1::IHostMemory* blob) +//! A helper method to find an item from an unordered_map. If the exact match exists, this is identical to +//! map.find(target). If the exact match does not exist, it returns the first plausible match, taking up to one wildcard +//! into account. If there is no plausible match, then it returns map.end(). +template +typename std::unordered_map::const_iterator findPlausible( + std::unordered_map const& map, std::string const& target) { - std::ofstream oFile(outFileName, std::ios::out | std::ios::binary); - if (!oFile) + auto res = map.find(target); + if (res == map.end()) { - sample::gLogWarning << "Could not write timing cache to: " << outFileName << std::endl; - return; + res = std::find_if( + map.begin(), map.end(), [&](typename std::unordered_map::value_type const& item) { + return matchStringWithOneWildcard(item.first, target); + }); } - oFile.write((char*) blob->data(), blob->size()); - oFile.close(); - sample::gLogInfo << "Saved " << blob->size() << " bytes of timing cache to " << outFileName << std::endl; -} - -inline int32_t getCudaDriverVersion() -{ - int32_t version{-1}; - cudaCheck(cudaDriverGetVersion(&version)); - return version; -} - -inline int32_t getCudaRuntimeVersion() -{ - int32_t version{-1}; - cudaCheck(cudaRuntimeGetVersion(&version)); - return version; + return res; } } // namespace sample diff --git a/src/Detector/tensorrt_yolo/common/streamReader.h b/src/Detector/tensorrt_yolo/common/streamReader.h new file mode 100644 index 00000000..7d4aa1c6 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/streamReader.h @@ -0,0 +1,78 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
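`matchStringWithOneWildcard` together with `findPlausible` lets per-tensor options keyed by tensor name be written with a single `*` wildcard. The sketch below restates the prefix-plus-anything-plus-suffix rule in standalone form; it is not the library function itself and glosses over edge cases such as overlapping prefix and suffix:

```cpp
#include <iostream>
#include <string>

// One-wildcard match: the target must start with the text before '*' and end with the text after it.
bool matchOneWildcard(std::string const& pattern, std::string const& target)
{
    auto const star = pattern.find('*');
    if (star == std::string::npos)
    {
        return pattern == target; // no wildcard: exact match only
    }
    std::string const prefix = pattern.substr(0, star);
    std::string const suffix = pattern.substr(star + 1);
    return target.size() >= prefix.size() + suffix.size()
        && target.compare(0, prefix.size(), prefix) == 0
        && target.compare(target.size() - suffix.size(), suffix.size(), suffix) == 0;
}

int main()
{
    std::cout << matchOneWildcard("model/conv*_relu", "model/conv3_relu") << '\n'; // 1
    std::cout << matchOneWildcard("model/conv*_relu", "model/fc1") << '\n';        // 0
    return 0;
}
```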
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef STREAM_READER_H +#define STREAM_READER_H + +#include "NvInferRuntime.h" +#include "sampleUtils.h" +#include + +namespace samplesCommon +{ + +//! Implements the TensorRT IStreamReader to allow deserializing an engine directly from the plan file. +class FileStreamReader final : public nvinfer1::IStreamReader +{ +public: + bool open(std::string filepath) + { + mFile.open(filepath, std::ios::binary); + return mFile.is_open(); + } + + void close() + { + if (mFile.is_open()) + { + mFile.close(); + } + } + + ~FileStreamReader() final + { + close(); + } + + int64_t read(void* dest, int64_t bytes) final + { + if (!mFile.good()) + { + return -1; + } + mFile.read(static_cast(dest), bytes); + return mFile.gcount(); + } + + void reset() + { + assert(mFile.good()); + mFile.seekg(0); + } + + bool isOpen() const + { + return mFile.is_open(); + } + +private: + std::ifstream mFile; +}; + +} // namespace samplesCommon + +#endif // STREAM_READER_H diff --git a/src/Detector/tensorrt_yolo/common/timingCache.cpp b/src/Detector/tensorrt_yolo/common/timingCache.cpp new file mode 100644 index 00000000..18e85ba4 --- /dev/null +++ b/src/Detector/tensorrt_yolo/common/timingCache.cpp @@ -0,0 +1,157 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "timingCache.h" +#include "NvInfer.h" +#include "fileLock.h" +#include "sampleUtils.h" +#include +#include +#include +#include +#include +#include +using namespace nvinfer1; +namespace nvinfer1 +{ +namespace utils +{ +std::vector loadTimingCacheFile(ILogger& logger, std::string const& inFileName) +{ + try + { + std::unique_ptr fileLock{new FileLock(logger, inFileName)}; + std::ifstream iFile(inFileName, std::ios::in | std::ios::binary); + if (!iFile) + { + std::stringstream ss; + ss << "Could not read timing cache from: " << inFileName + << ". 
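`FileStreamReader` implements `nvinfer1::IStreamReader`, which recent TensorRT releases can consume to deserialize an engine straight from the plan file instead of buffering the whole file in host memory first. A hedged usage sketch: it assumes the `IRuntime::deserializeCudaEngine(IStreamReader&)` overload available in TensorRT 10, and the plan-file path is whatever the caller provides:

```cpp
#include <string>

#include "NvInferRuntime.h"
#include "streamReader.h"

// Sketch only: stream an engine plan from disk into deserializeCudaEngine().
nvinfer1::ICudaEngine* loadEngineFromPlan(nvinfer1::IRuntime& runtime, std::string const& planFile)
{
    samplesCommon::FileStreamReader reader;
    if (!reader.open(planFile))
    {
        return nullptr; // file could not be opened
    }
    return runtime.deserializeCudaEngine(reader);
}
```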
A new timing cache will be generated and written."; + logger.log(ILogger::Severity::kWARNING, ss.str().c_str()); + return std::vector(); + } + iFile.seekg(0, std::ifstream::end); + size_t fsize = iFile.tellg(); + iFile.seekg(0, std::ifstream::beg); + std::vector content(fsize); + iFile.read(content.data(), fsize); + iFile.close(); + std::stringstream ss; + ss << "Loaded " << fsize << " bytes of timing cache from " << inFileName; + logger.log(ILogger::Severity::kINFO, ss.str().c_str()); + return content; + } + catch (std::exception const& e) + { + std::cerr << "Exception detected: " << e.what() << std::endl; + } + return {}; +} + +std::unique_ptr buildTimingCacheFromFile( + ILogger& logger, IBuilderConfig& config, std::string const& timingCacheFile, std::ostream& err) +{ + std::unique_ptr timingCache{}; + auto timingCacheContents = loadTimingCacheFile(logger, timingCacheFile); + timingCache.reset(config.createTimingCache(timingCacheContents.data(), timingCacheContents.size())); + SMP_RETVAL_IF_FALSE(timingCache != nullptr, "TimingCache creation failed", nullptr, err); + config.clearFlag(BuilderFlag::kDISABLE_TIMING_CACHE); + SMP_RETVAL_IF_FALSE( + config.setTimingCache(*timingCache, true), "IBuilderConfig setTimingCache failed", nullptr, err); + return timingCache; +} + +void saveTimingCacheFile(ILogger& logger, std::string const& outFileName, IHostMemory const* blob) +{ + try + { + std::unique_ptr fileLock{new FileLock(logger, outFileName)}; + std::ofstream oFile(outFileName, std::ios::out | std::ios::binary); + if (!oFile) + { + std::stringstream ss; + ss << "Could not write timing cache to: " << outFileName; + logger.log(ILogger::Severity::kWARNING, ss.str().c_str()); + return; + } + oFile.write(reinterpret_cast(blob->data()), blob->size()); + oFile.close(); + std::stringstream ss; + ss << "Saved " << blob->size() << " bytes of timing cache to " << outFileName; + logger.log(ILogger::Severity::kINFO, ss.str().c_str()); + } + catch (std::exception const& e) + { + std::cerr << "Exception detected: " << e.what() << std::endl; + } +} + +void updateTimingCacheFile(nvinfer1::ILogger& logger, std::string const& fileName, + nvinfer1::ITimingCache const* timingCache, nvinfer1::IBuilder& builder) +{ + try + { + // Prepare empty timingCache in case that there is no existing file to read + std::unique_ptr config{builder.createBuilderConfig()}; + std::unique_ptr fileTimingCache{config->createTimingCache(static_cast(nullptr), 0)}; + + std::unique_ptr fileLock{new FileLock(logger, fileName)}; + std::ifstream iFile(fileName, std::ios::in | std::ios::binary); + if (iFile) + { + iFile.seekg(0, std::ifstream::end); + size_t fsize = iFile.tellg(); + iFile.seekg(0, std::ifstream::beg); + std::vector content(fsize); + iFile.read(content.data(), fsize); + iFile.close(); + std::stringstream ss; + ss << "Loaded " << fsize << " bytes of timing cache from " << fileName; + logger.log(ILogger::Severity::kINFO, ss.str().c_str()); + fileTimingCache.reset(config->createTimingCache(static_cast(content.data()), content.size())); + if (!fileTimingCache) + { + throw std::runtime_error("Failed to create timingCache from " + fileName + "!"); + } + } + fileTimingCache->combine(*timingCache, false); + std::unique_ptr blob{fileTimingCache->serialize()}; + if (!blob) + { + throw std::runtime_error("Failed to serialize ITimingCache!"); + } + std::ofstream oFile(fileName, std::ios::out | std::ios::binary); + if (!oFile) + { + std::stringstream ss; + ss << "Could not write timing cache to: " << fileName; + 
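The `timingCache` helpers wrap the load, attach, and save cycle of `ITimingCache`, with a `FileLock` so concurrent builds do not clobber each other's cache file. A hedged sketch of how a build might use them, following the declarations above; the logger and cache-file path are whatever the caller supplies:

```cpp
#include <iostream>
#include <memory>
#include <string>

#include "NvInfer.h"
#include "timingCache.h"

// Sketch only: attach a (possibly empty) timing cache before building, then merge
// the timings gathered during this build back into the cache file.
std::unique_ptr<nvinfer1::IHostMemory> buildWithTimingCache(nvinfer1::IBuilder& builder,
    nvinfer1::INetworkDefinition& network, nvinfer1::ILogger& logger, std::string const& cacheFile)
{
    std::unique_ptr<nvinfer1::IBuilderConfig> config{builder.createBuilderConfig()};
    auto timingCache = nvinfer1::utils::buildTimingCacheFromFile(logger, *config, cacheFile, std::cerr);

    std::unique_ptr<nvinfer1::IHostMemory> plan{builder.buildSerializedNetwork(network, *config)};
    if (plan && timingCache)
    {
        // Combine the new timings with whatever is already on disk.
        nvinfer1::utils::updateTimingCacheFile(logger, cacheFile, timingCache.get(), builder);
    }
    return plan;
}
```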
+            logger.log(ILogger::Severity::kWARNING, ss.str().c_str());
+            return;
+        }
+        oFile.write(reinterpret_cast<char const*>(blob->data()), blob->size());
+        oFile.close();
+        std::stringstream ss;
+        ss << "Saved " << blob->size() << " bytes of timing cache to " << fileName;
+        logger.log(ILogger::Severity::kINFO, ss.str().c_str());
+    }
+    catch (std::exception const& e)
+    {
+        std::cerr << "Exception detected: " << e.what() << std::endl;
+    }
+}
+} // namespace utils
+} // namespace nvinfer1
diff --git a/src/Detector/tensorrt_yolo/common/timingCache.h b/src/Detector/tensorrt_yolo/common/timingCache.h
new file mode 100644
index 00000000..c4c76e37
--- /dev/null
+++ b/src/Detector/tensorrt_yolo/common/timingCache.h
@@ -0,0 +1,38 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef TENSORRT_SAMPLES_COMMON_TIMINGCACHE_H_
+#define TENSORRT_SAMPLES_COMMON_TIMINGCACHE_H_
+#include "NvInfer.h"
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace nvinfer1
+{
+namespace utils
+{
+std::vector<char> loadTimingCacheFile(nvinfer1::ILogger& logger, std::string const& inFileName);
+std::unique_ptr<nvinfer1::ITimingCache> buildTimingCacheFromFile(
+    ILogger& logger, IBuilderConfig& config, std::string const& timingCacheFile, std::ostream& err);
+void saveTimingCacheFile(nvinfer1::ILogger& logger, std::string const& outFileName, nvinfer1::IHostMemory const* blob);
+void updateTimingCacheFile(nvinfer1::ILogger& logger, std::string const& fileName,
+    nvinfer1::ITimingCache const* timingCache, nvinfer1::IBuilder& builder);
+} // namespace utils
+} // namespace nvinfer1
+
+#endif // TENSORRT_SAMPLES_COMMON_TIMINGCACHE_H_
diff --git a/src/Detector/tensorrt_yolo/ds_image.cpp b/src/Detector/tensorrt_yolo/ds_image.cpp
index b801b874..77404f97 100644
--- a/src/Detector/tensorrt_yolo/ds_image.cpp
+++ b/src/Detector/tensorrt_yolo/ds_image.cpp
@@ -50,7 +50,8 @@ DsImage::DsImage(const cv::Mat& mat_image_, tensor_rt::ModelType net_type, const
     if (tensor_rt::ModelType::YOLOV5 == net_type || tensor_rt::ModelType::YOLOV6 == net_type ||
         tensor_rt::ModelType::YOLOV7 == net_type || tensor_rt::ModelType::YOLOV7Mask == net_type ||
         tensor_rt::ModelType::YOLOV8 == net_type || tensor_rt::ModelType::YOLOV8_OBB == net_type || tensor_rt::ModelType::YOLOV8Mask == net_type ||
-        tensor_rt::ModelType::YOLOV9 == net_type || tensor_rt::ModelType::YOLOV10 == net_type)
+        tensor_rt::ModelType::YOLOV9 == net_type || tensor_rt::ModelType::YOLOV10 == net_type ||
+        tensor_rt::ModelType::YOLOV11 == net_type || tensor_rt::ModelType::YOLOV11_OBB == net_type || tensor_rt::ModelType::YOLOV11Mask == net_type)
     {
         // resize the DsImage with scale
         float r = std::min(static_cast<float>(inputH) / static_cast<float>(m_Height), static_cast<float>(inputW) / static_cast<float>(m_Width));
@@ -101,7 +102,8 @@ DsImage::DsImage(const std::string& path, tensor_rt::ModelType net_type, const i
     if (tensor_rt::ModelType::YOLOV5 == net_type || tensor_rt::ModelType::YOLOV6 == net_type ||
         tensor_rt::ModelType::YOLOV7 == net_type || tensor_rt::ModelType::YOLOV7Mask == net_type ||
         tensor_rt::ModelType::YOLOV8 == net_type || tensor_rt::ModelType::YOLOV8_OBB == net_type || tensor_rt::ModelType::YOLOV8Mask == net_type ||
-        tensor_rt::ModelType::YOLOV9 == net_type || tensor_rt::ModelType::YOLOV10 == net_type)
+        tensor_rt::ModelType::YOLOV9 == net_type || tensor_rt::ModelType::YOLOV10 == net_type ||
+        tensor_rt::ModelType::YOLOV11 == net_type || tensor_rt::ModelType::YOLOV11_OBB == net_type || tensor_rt::ModelType::YOLOV11Mask == net_type)
     {
         // resize the DsImage with scale
         float dim = std::max(m_Height, m_Width);
diff --git a/src/Detector/tensorrt_yolo/yolo.cpp b/src/Detector/tensorrt_yolo/yolo.cpp
index a60d3dc4..4ee202b6 100644
--- a/src/Detector/tensorrt_yolo/yolo.cpp
+++ b/src/Detector/tensorrt_yolo/yolo.cpp
@@ -78,7 +78,31 @@ Yolo::Yolo(const NetworkInfo& networkInfo, const InferParams& inferParams)
     assert(m_Engine != nullptr);
     m_Context = m_Engine->createExecutionContext();
     assert(m_Context != nullptr);
+
+    auto numBindings = m_Engine->getNbIOTensors();
+    //std::cout << "** Bindings: " << numBindings << " **" << std::endl;
+    for (int32_t i = 0; i < numBindings; ++i)
+    {
+        std::string bindName = m_Engine->getIOTensorName(i);
+        m_tensorNames.emplace(bindName, i);
+        nvinfer1::Dims dim = m_Engine->getTensorShape(bindName.c_str());
+
+        std::cout << i << ": name: " << bindName;
+        std::cout << ", size: ";
+        for (int j = 0; j < dim.nbDims; ++j)
+        {
+            std::cout << dim.d[j];
+            if (j < dim.nbDims - 1)
+                std::cout << "x";
+        }
+        std::cout << std::endl;
+
+        if (m_InputBlobName == bindName)
+            m_InputBindingIndex = i;
+    }
+#if (NV_TENSORRT_MAJOR < 9)
     m_InputBindingIndex = m_Engine->getBindingIndex(m_InputBlobName.c_str());
+#endif
     assert(m_InputBindingIndex != -1);
     assert(m_BatchSize <= static_cast<uint32_t>(m_Engine->getMaxBatchSize()));
     allocateBuffers();
@@ -464,7 +488,14 @@ void Yolo::createYOLOEngine(const nvinfer1::DataType dataType, Int8EntropyCalibr
     // Build the engine
     std::cout << "Building the TensorRT Engine..." << std::endl;
-    m_Engine = m_Builder->buildEngineWithConfig(*m_Network, *config);
+#if (NV_TENSORRT_MAJOR < 9)
+    m_Engine = m_Builder->buildEngineWithConfig(*m_Network, *config);
+#else
+    nvinfer1::IRuntime* inferRuntime = nvinfer1::createInferRuntime(m_Logger);
+    nvinfer1::IHostMemory* serialNetwork = m_Builder->buildSerializedNetwork(*m_Network, *config);
+    m_Engine = inferRuntime->deserializeCudaEngine(serialNetwork->data(), serialNetwork->size());
+    delete inferRuntime;
+#endif
     assert(m_Engine != nullptr);
     std::cout << "Building complete!" << std::endl;
@@ -942,7 +973,15 @@ void Yolo::create_engine_yolov5(const nvinfer1::DataType dataType, Int8EntropyCa
 #endif
     // Build the engine
     std::cout << "Building the TensorRT Engine..." << std::endl;
+#if (NV_TENSORRT_MAJOR < 9)
     m_Engine = m_Builder->buildEngineWithConfig(*m_Network, *config);
+#else
+    nvinfer1::IRuntime* inferRuntime = nvinfer1::createInferRuntime(m_Logger);
+    nvinfer1::IHostMemory* serialNetwork = m_Builder->buildSerializedNetwork(*m_Network, *config);
+    m_Engine = inferRuntime->deserializeCudaEngine(serialNetwork->data(), serialNetwork->size());
+    delete inferRuntime;
+#endif
+
     assert(m_Engine != nullptr);
     std::cout << "Building complete!"
               << std::endl;
@@ -987,7 +1026,8 @@ void Yolo::doInference(const unsigned char* input, const uint32_t batchSize)
                                   batchSize * m_InputSize * sizeof(float),
                                   cudaMemcpyHostToDevice, m_CudaStream));
-    m_Context->enqueue(batchSize, m_DeviceBuffers.data(), m_CudaStream, nullptr);
+    //m_Context->enqueueV3(batchSize, m_DeviceBuffers.data(), m_CudaStream, nullptr);
+    m_Context->enqueueV3(m_CudaStream);
     for (auto& tensor : m_OutputTensors)
     {
         NV_CUDA_CHECK(cudaMemcpyAsync(tensor.hostBuffer, m_DeviceBuffers.at(tensor.bindingIndex),
@@ -1249,8 +1289,7 @@ void Yolo::parse_cfg_blocks_v5(const std::vector<std::map<std::string, std::str
-    m_DeviceBuffers.resize(m_Engine->getNbBindings(), nullptr);
+    m_DeviceBuffers.resize(m_Engine->getNbIOTensors(), nullptr);
     assert(m_InputBindingIndex != -1 && "Invalid input binding index");
-    NV_CUDA_CHECK(cudaMalloc(&m_DeviceBuffers.at(m_InputBindingIndex),
-                             m_BatchSize * m_InputSize * sizeof(float)));
+    NV_CUDA_CHECK(cudaMalloc(&m_DeviceBuffers.at(m_InputBindingIndex), m_BatchSize * m_InputSize * sizeof(float)));
     for (auto& tensor : m_OutputTensors)
     {
+#if (NV_TENSORRT_MAJOR < 9)
         tensor.bindingIndex = m_Engine->getBindingIndex(tensor.blobName.c_str());
+#else
+        auto it = m_tensorNames.find(tensor.blobName);
+        tensor.bindingIndex = (it != std::end(m_tensorNames)) ? it->second : -1;
+#endif
         assert((tensor.bindingIndex != -1) && "Invalid output binding index");
-        NV_CUDA_CHECK(cudaMalloc(&m_DeviceBuffers.at(tensor.bindingIndex),
-                                 m_BatchSize * tensor.volume * sizeof(float)));
-        NV_CUDA_CHECK(
-            cudaMallocHost(&tensor.hostBuffer, tensor.volume * m_BatchSize * sizeof(float)));
+        NV_CUDA_CHECK(cudaMalloc(&m_DeviceBuffers.at(tensor.bindingIndex), m_BatchSize * tensor.volume * sizeof(float)));
+        NV_CUDA_CHECK(cudaMallocHost((void**)&tensor.hostBuffer, tensor.volume * m_BatchSize * sizeof(float)));
     }
 }
diff --git a/src/Detector/tensorrt_yolo/yolo.h b/src/Detector/tensorrt_yolo/yolo.h
index be347d19..4cfdba16 100644
--- a/src/Detector/tensorrt_yolo/yolo.h
+++ b/src/Detector/tensorrt_yolo/yolo.h
@@ -158,6 +158,7 @@ class Yolo
     std::vector<void*> m_DeviceBuffers;
     int m_InputBindingIndex = -1;
     cudaStream_t m_CudaStream = nullptr;
+    std::map<std::string, int32_t> m_tensorNames;
 
     virtual std::vector<BBoxInfo> decodeTensor(const int imageIdx, const int imageH, const int imageW, const TensorInfo& tensor) = 0;
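
The `FileStreamReader` added above implements `nvinfer1::IStreamReader`, so a serialized plan can be streamed straight into the runtime instead of being read into a temporary host buffer first. Below is a minimal sketch, assuming the TensorRT 10 `IRuntime::deserializeCudaEngine(IStreamReader&)` overload; `loadEngine()` and the plan path are illustrative names, not part of the patch.

```cpp
// Sketch: deserialize an engine through samplesCommon::FileStreamReader (TensorRT 10).
#include "streamReader.h"
#include "NvInferRuntime.h"
#include <iostream>
#include <string>

nvinfer1::ICudaEngine* loadEngine(nvinfer1::IRuntime& runtime, const std::string& planPath)
{
    samplesCommon::FileStreamReader reader;
    if (!reader.open(planPath))
    {
        std::cerr << "Could not open " << planPath << std::endl;
        return nullptr;
    }
    // The runtime pulls bytes through FileStreamReader::read() on demand,
    // so the whole plan never has to sit in a std::vector on the host.
    return runtime.deserializeCudaEngine(reader);
}
```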
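With TensorRT 10 the binding-index API is gone: `enqueueV3()` takes only the CUDA stream, and every I/O tensor must be bound by name with `setTensorAddress()` before the call. The hunk above only shows the switch to `enqueueV3()`; the sketch below illustrates the name-based binding it relies on, with a hypothetical `runInference()` helper standing in for the real `Yolo::doInference()`.

```cpp
// Sketch: name-based I/O binding required by enqueueV3() in TensorRT 10.
#include "NvInfer.h"
#include <cuda_runtime_api.h>
#include <map>
#include <string>

bool runInference(nvinfer1::ICudaEngine& engine, nvinfer1::IExecutionContext& context,
                  const std::map<std::string, void*>& deviceBuffers, cudaStream_t stream)
{
    // enqueueV3() takes no buffer array: each I/O tensor needs its device
    // address registered on the execution context before the call.
    for (int32_t i = 0; i < engine.getNbIOTensors(); ++i)
    {
        const char* name = engine.getIOTensorName(i);
        auto it = deviceBuffers.find(name);
        if (it == deviceBuffers.end() || !context.setTensorAddress(name, it->second))
            return false;
    }
    return context.enqueueV3(stream);
}
```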
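`timingCache.cpp` and `timingCache.h` add the usual TensorRT sample helpers for persisting builder tactic timings between runs, which shortens repeated engine builds. The hunks above do not show where they are called from, so the following is only a sketch of how they could wrap the `buildSerializedNetwork()` path; `timing.cache` and `buildWithTimingCache()` are placeholder names.

```cpp
// Sketch: seeding and updating a timing cache around an engine build.
#include "timingCache.h"
#include "NvInfer.h"
#include <iostream>
#include <string>

nvinfer1::IHostMemory* buildWithTimingCache(nvinfer1::IBuilder& builder,
    nvinfer1::INetworkDefinition& network, nvinfer1::IBuilderConfig& config,
    nvinfer1::ILogger& logger)
{
    const std::string cacheFile = "timing.cache";
    // Seed the builder config with any previously recorded tactic timings.
    auto cache = nvinfer1::utils::buildTimingCacheFromFile(logger, config, cacheFile, std::cerr);
    nvinfer1::IHostMemory* plan = builder.buildSerializedNetwork(network, config);
    if (plan && cache)
    {
        // Merge the timings measured during this build back into the file.
        nvinfer1::utils::updateTimingCacheFile(logger, cacheFile, cache.get(), builder);
    }
    return plan;
}
```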