diff --git a/perception/autoware_tensorrt_yolov10/CMakeLists.txt b/perception/autoware_tensorrt_yolov10/CMakeLists.txt
new file mode 100644
index 0000000000000..daaa0c5ed77ba
--- /dev/null
+++ b/perception/autoware_tensorrt_yolov10/CMakeLists.txt
@@ -0,0 +1,175 @@
+cmake_minimum_required(VERSION 3.17)
+project(autoware_tensorrt_yolov10)
+
+find_package(tensorrt_common)
+if(NOT ${tensorrt_common_FOUND})
+  message(WARNING "The tensorrt_common package is not found. Please check its dependencies.")
+  return()
+endif()
+
+find_package(autoware_cmake REQUIRED)
+autoware_package()
+
+find_package(OpenCV REQUIRED)
+
+option(CUDA_VERBOSE "Verbose output of CUDA modules" OFF)
+
+# set flags for CUDA availability
+option(CUDA_AVAIL "CUDA available" OFF)
+find_package(CUDA)
+if(CUDA_FOUND)
+  find_library(CUBLAS_LIBRARIES cublas HINTS
+    ${CUDA_TOOLKIT_ROOT_DIR}/lib64
+    ${CUDA_TOOLKIT_ROOT_DIR}/lib
+  )
+  if(CUDA_VERBOSE)
+    message("CUDA is available!")
+    message("CUDA Libs: ${CUDA_LIBRARIES}")
+    message("CUDA Headers: ${CUDA_INCLUDE_DIRS}")
+  endif()
+  # Note: cublas_device was deprecated in CUDA version 9.2
+  # https://forums.developer.nvidia.com/t/where-can-i-find-libcublas-device-so-or-libcublas-device-a/67251/4
+  # In LibTorch, CUDA_cublas_device_LIBRARY is used.
+  unset(CUDA_cublas_device_LIBRARY CACHE)
+  set(CUDA_AVAIL ON)
+else()
+  message("CUDA NOT FOUND")
+  set(CUDA_AVAIL OFF)
+endif()
+
+# set flags for TensorRT availability
+option(TRT_AVAIL "TensorRT available" OFF)
+# try to find the TensorRT modules
+find_library(NVINFER nvinfer)
+find_library(NVONNXPARSER nvonnxparser)
+if(NVINFER AND NVONNXPARSER)
+  if(CUDA_VERBOSE)
+    message("TensorRT is available!")
+    message("NVINFER: ${NVINFER}")
+    message("NVONNXPARSER: ${NVONNXPARSER}")
+  endif()
+  set(TRT_AVAIL ON)
+else()
+  message("TensorRT is NOT Available")
+  set(TRT_AVAIL OFF)
+endif()
+
+# set flags for CUDNN availability
+option(CUDNN_AVAIL "CUDNN available" OFF)
+# try to find the CUDNN module
+find_library(CUDNN_LIBRARY
+  NAMES libcudnn.so${__cudnn_ver_suffix} libcudnn${__cudnn_ver_suffix}.dylib ${__cudnn_lib_win_name}
+  PATHS $ENV{LD_LIBRARY_PATH} ${__libpath_cudart} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX}
+  PATH_SUFFIXES lib lib64 bin
+  DOC "CUDNN library."
+)
+if(CUDNN_LIBRARY)
+  if(CUDA_VERBOSE)
+    message(STATUS "CUDNN is available!")
+    message(STATUS "CUDNN_LIBRARY: ${CUDNN_LIBRARY}")
+  endif()
+  set(CUDNN_AVAIL ON)
+else()
+  message("CUDNN is NOT Available")
+  set(CUDNN_AVAIL OFF)
+endif()
+
+find_package(OpenMP)
+if(OpenMP_FOUND)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+endif()
+
+##########
+# tensorrt_yolov10
+ament_auto_add_library(${PROJECT_NAME} SHARED
+  src/tensorrt_yolov10.cpp
+  src/tensorrt_yolov10_node.cpp
+)
+
+ament_target_dependencies(${PROJECT_NAME}
+  OpenCV
+)
+
+if(TRT_AVAIL AND CUDA_AVAIL AND CUDNN_AVAIL)
+  # Officially, add_library supports .cu file compilation.
+  # However, as of cmake 3.22.1, it seems to fail compilation because compiler flags for
+  # C++ are directly passed to nvcc (they are originally space separated,
+  # but nvcc assumes they are comma separated as the argument of the `-Xcompiler` option).
+  # That is why `cuda_add_library` is used here.
+  # cuda_add_library(${PROJECT_NAME}_gpu_preprocess
+  #   SHARED
+  #   src/preprocess.cu
+  # )
+
+  # target_include_directories(${PROJECT_NAME}_gpu_preprocess PUBLIC
+  #   $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  #   $<INSTALL_INTERFACE:include>
+  # )
+
+  # target_link_libraries(${PROJECT_NAME}
+  #   ${tensorrt_common_LIBRARIES}
+  #   ${PROJECT_NAME}_gpu_preprocess
+  # )
+else()
+  target_link_libraries(${PROJECT_NAME}
+    ${tensorrt_common_LIBRARIES}
+  )
+endif()
+
+target_compile_definitions(${PROJECT_NAME} PRIVATE
+  TENSORRT_VERSION_MAJOR=${TENSORRT_VERSION_MAJOR}
+)
+
+ament_auto_add_library(yolov10_single_image_inference_node SHARED
+  src/yolov10_single_image_inference_node.cpp
+)
+
+ament_target_dependencies(yolov10_single_image_inference_node
+  OpenCV
+)
+
+target_link_libraries(yolov10_single_image_inference_node
+  ${PROJECT_NAME}
+  stdc++fs
+)
+
+target_compile_definitions(yolov10_single_image_inference_node PRIVATE
+  TENSORRT_VERSION_MAJOR=${TENSORRT_VERSION_MAJOR}
+)
+
+rclcpp_components_register_node(yolov10_single_image_inference_node
+  PLUGIN "autoware::tensorrt_yolov10::Yolov10SingleImageInferenceNode"
+  EXECUTABLE yolov10_single_image_inference
+)
+
+ament_auto_add_library(${PROJECT_NAME}_node SHARED
+  src/tensorrt_yolov10_node.cpp
+)
+
+ament_target_dependencies(${PROJECT_NAME}_node
+  OpenCV
+)
+
+target_link_libraries(${PROJECT_NAME}_node
+  ${PROJECT_NAME}
+)
+
+target_compile_definitions(${PROJECT_NAME}_node PRIVATE
+  TENSORRT_VERSION_MAJOR=${TENSORRT_VERSION_MAJOR}
+)
+
+rclcpp_components_register_node(${PROJECT_NAME}_node
+  PLUGIN "autoware::tensorrt_yolov10::TrtYolov10Node"
+  EXECUTABLE ${PROJECT_NAME}_node_exe
+)
+
+if(BUILD_TESTING)
+  find_package(ament_lint_auto REQUIRED)
+  ament_lint_auto_find_test_dependencies()
+endif()
+
+ament_auto_package(INSTALL_TO_SHARE
+  launch
+  config
+)
diff --git a/perception/autoware_tensorrt_yolov10/README.md b/perception/autoware_tensorrt_yolov10/README.md
new file mode 100644
index 0000000000000..aa813d5ad5a04
--- /dev/null
+++ b/perception/autoware_tensorrt_yolov10/README.md
@@ -0,0 +1,136 @@
+# autoware_tensorrt_yolov10
+
+## Purpose
+
+This package detects target objects, e.g., cars, trucks, bicycles, and pedestrians, on an image using the [YOLOV10](https://github.com/THU-MIG/yolov10) model.
+
+## Inputs / Outputs
+
+### Input
+
+| Name       | Type                | Description     |
+| ---------- | ------------------- | --------------- |
+| `in/image` | `sensor_msgs/Image` | The input image |
+
+### Output
+
+| Name          | Type                                               | Description                                        |
+| ------------- | -------------------------------------------------- | -------------------------------------------------- |
+| `out/objects` | `tier4_perception_msgs/DetectedObjectsWithFeature` | The detected objects with 2D bounding boxes        |
+| `out/image`   | `sensor_msgs/Image`                                | The image with 2D bounding boxes for visualization |
+
+## Assumptions / Known limits
+
+The label contained in the detected 2D bounding boxes (i.e., `out/objects`) will be one of the following:
+
+- CAR
+- PEDESTRIAN ("PERSON" will also be categorized as "PEDESTRIAN")
+- BUS
+- TRUCK
+- BICYCLE
+- MOTORCYCLE
+
+If other labels (case insensitive) are contained in the file specified via the `label_file` parameter,
+they are labeled as `UNKNOWN`, while the detected rectangles are still drawn in the visualization result (`out/image`).
+
+## ONNX model
+
+You can download `yolov10m.onnx` from the [releases](https://github.com/THU-MIG/yolov10/releases) page.
+
+## Label file
+
+This file defines the correspondence between class indices (integers output by the YOLOV10 network) and
+class labels (human-readable strings). This package maps class IDs (incremented from 0)
+to labels according to the order in this file, as sketched below.
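+
+For reference, a minimal sketch of how such a file could be loaded (`load_label_map` is a hypothetical helper for illustration; the node implements its own `readLabelFile`):
+
+```cpp
+#include <fstream>
+#include <map>
+#include <string>
+
+// Build a class-ID -> label map from a label file with one label per line.
+// IDs are assigned in file order, starting from 0.
+std::map<int, std::string> load_label_map(const std::string & label_path)
+{
+  std::map<int, std::string> label_map;
+  std::ifstream label_file(label_path);
+  std::string label;
+  int id = 0;
+  while (std::getline(label_file, label)) {
+    label_map.insert({id++, label});
+  }
+  return label_map;
+}
+```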
+
+Currently, this file is the COCO label file, which contains the following labels:
+
+```
+person
+bicycle
+car
+motorcycle
+airplane
+bus
+train
+truck
+boat
+traffic light
+fire hydrant
+stop sign
+parking meter
+bench
+bird
+cat
+dog
+horse
+sheep
+cow
+elephant
+bear
+zebra
+giraffe
+backpack
+umbrella
+handbag
+tie
+suitcase
+frisbee
+skis
+snowboard
+sports ball
+kite
+baseball bat
+baseball glove
+skateboard
+surfboard
+tennis racket
+bottle
+wine glass
+cup
+fork
+knife
+spoon
+bowl
+banana
+apple
+sandwich
+orange
+broccoli
+carrot
+hot dog
+pizza
+donut
+cake
+chair
+couch
+potted plant
+bed
+dining table
+toilet
+tv
+laptop
+mouse
+remote
+keyboard
+cell phone
+microwave
+oven
+toaster
+sink
+refrigerator
+book
+clock
+vase
+scissors
+teddy bear
+hair drier
+```
+
+## Reference repositories
+
+- 
+
+## Legal Notice
+
+The inference code is licensed under Apache 2.0. The model and training code are licensed under AGPL-3.0. You can check the details at . To inquire about a commercial license for use of the trained model weights, please contact the YOLOv10 authors.
diff --git a/perception/autoware_tensorrt_yolov10/include/autoware/tensorrt_yolov10/calibrator.hpp b/perception/autoware_tensorrt_yolov10/include/autoware/tensorrt_yolov10/calibrator.hpp
new file mode 100644
index 0000000000000..a3e1fd0e257fb
--- /dev/null
+++ b/perception/autoware_tensorrt_yolov10/include/autoware/tensorrt_yolov10/calibrator.hpp
@@ -0,0 +1,493 @@
+// Copyright 2023 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */ + +#ifndef AUTOWARE__TENSORRT_YOLOV10__CALIBRATOR_HPP_ +#define AUTOWARE__TENSORRT_YOLOV10__CALIBRATOR_HPP_ + +#include "cuda_utils/cuda_check_error.hpp" +#include "cuda_utils/cuda_unique_ptr.hpp" + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +namespace autoware::tensorrt_yolov10 +{ +class ImageStream +{ +public: + ImageStream( + int batch_size, nvinfer1::Dims input_dims, const std::vector calibration_images) + : batch_size_(batch_size), + calibration_images_(calibration_images), + current_batch_(0), + max_batches_(calibration_images.size() / batch_size_), + input_dims_(input_dims) + { + batch_.resize(batch_size_ * input_dims_.d[1] * input_dims_.d[2] * input_dims_.d[3]); + } + + int getBatchSize() const { return batch_size_; } + + int getMaxBatches() const { return max_batches_; } + + float * getBatch() { return &batch_[0]; } + + nvinfer1::Dims getInputDims() { return input_dims_; } + + /** + * @brief Preprocess in calibration + * @param[in] images input images + * @param[in] input_dims input dimensions + * @param[in] norm normalization (0.0-1.0) + * @return vector preprocessed data + */ + std::vector preprocess( + const std::vector & images, nvinfer1::Dims input_dims, double norm) + { + std::vector input_h_; + const auto batch_size = images.size(); + input_dims.d[0] = batch_size; + const float input_height = static_cast(input_dims.d[2]); + const float input_width = static_cast(input_dims.d[3]); + std::vector dst_images; + std::vector scales_; + scales_.clear(); + for (const auto & image : images) { + cv::Mat dst_image; + const float scale = std::min(input_width / image.cols, input_height / image.rows); + scales_.emplace_back(scale); + const auto scale_size = cv::Size(image.cols * scale, image.rows * scale); + cv::resize(image, dst_image, scale_size, 0, 0, cv::INTER_CUBIC); + const auto bottom = input_height - dst_image.rows; + const auto right = input_width - dst_image.cols; + copyMakeBorder( + dst_image, dst_image, 0, bottom, 0, right, cv::BORDER_CONSTANT, {114, 114, 114}); + dst_images.emplace_back(dst_image); + } + const auto chw_images = + cv::dnn::blobFromImages(dst_images, norm, cv::Size(), cv::Scalar(), false, false, CV_32F); + + const auto data_length = chw_images.total(); + input_h_.reserve(data_length); + const auto flat = chw_images.reshape(1, data_length); + input_h_ = chw_images.isContinuous() ? 
flat : flat.clone(); + return input_h_; + } + + /** + * @brief Decode data in calibration + * @param[in] scale normalization (0.0-1.0) + * @return bool succ or fail + */ + bool next(double scale) + { + if (current_batch_ == max_batches_) { + return false; + } + + for (int i = 0; i < batch_size_; ++i) { + auto image = + cv::imread(calibration_images_[batch_size_ * current_batch_ + i].c_str(), cv::IMREAD_COLOR); + std::cout << current_batch_ << " " << i << " Preprocess " + << calibration_images_[batch_size_ * current_batch_ + i].c_str() << std::endl; + auto input = preprocess({image}, input_dims_, scale); + batch_.insert( + batch_.begin() + i * input_dims_.d[1] * input_dims_.d[2] * input_dims_.d[3], input.begin(), + input.end()); + } + + ++current_batch_; + return true; + } + + /** + * @brief Reset calibration + */ + void reset() { current_batch_ = 0; } + +private: + int batch_size_; + std::vector calibration_images_; + int current_batch_; + int max_batches_; + nvinfer1::Dims input_dims_; + std::vector batch_; +}; + +/**Percentile calibration using legacy calibrator*/ +/** + * @class Int8LegacyCalibrator + * @brief Calibrator for Percentile + * @warning We are confirming bug on Tegra like Xavier and Orin. We recommend use MinMax calibrator + */ +class Int8LegacyCalibrator : public nvinfer1::IInt8LegacyCalibrator +{ +public: + Int8LegacyCalibrator( + ImageStream & stream, const std::string calibration_cache_file, + const std::string histogram_cache_file, double scale = 1.0, bool read_cache = true, + double quantile = 0.999999, double cutoff = 0.999999) + : stream_(stream), + calibration_cache_file_(calibration_cache_file), + histogram_cache_file_(histogram_cache_file), + read_cache_(read_cache) + { + auto d = stream_.getInputDims(); + input_count_ = stream_.getBatchSize() * d.d[1] * d.d[2] * d.d[3]; + CHECK_CUDA_ERROR(cudaMalloc(&device_input_, input_count_ * sizeof(float))); + scale_ = scale; + quantile_ = quantile; + cutoff_ = cutoff; + auto algType = getAlgorithm(); + switch (algType) { + case (nvinfer1::CalibrationAlgoType::kLEGACY_CALIBRATION): + std::cout << "CalibrationAlgoType : kLEGACY_CALIBRATION" << std::endl; + break; + case (nvinfer1::CalibrationAlgoType::kENTROPY_CALIBRATION): + std::cout << "CalibrationAlgoType : kENTROPY_CALIBRATION" << std::endl; + break; + case (nvinfer1::CalibrationAlgoType::kENTROPY_CALIBRATION_2): + std::cout << "CalibrationAlgoType : kENTROPY_CALIBRATION_2" << std::endl; + break; + case (nvinfer1::CalibrationAlgoType::kMINMAX_CALIBRATION): + std::cout << "CalibrationAlgoType : kMINMAX_CALIBRATION" << std::endl; + break; + default: + std::cout << "No CalibrationAlgType" << std::endl; + break; + } + } + int getBatchSize() const noexcept override { return stream_.getBatchSize(); } + + virtual ~Int8LegacyCalibrator() { CHECK_CUDA_ERROR(cudaFree(device_input_)); } + + bool getBatch(void * bindings[], const char * names[], int nb_bindings) noexcept override + { + (void)names; + (void)nb_bindings; + + if (!stream_.next(scale_)) { + return false; + } + try { + CHECK_CUDA_ERROR(cudaMemcpy( + device_input_, stream_.getBatch(), input_count_ * sizeof(float), cudaMemcpyHostToDevice)); + } catch (const std::exception & e) { + // Do nothing + } + bindings[0] = device_input_; + return true; + } + + const void * readCalibrationCache(size_t & length) noexcept override + { + calib_cache_.clear(); + std::ifstream input(calibration_cache_file_, std::ios::binary); + input >> std::noskipws; + if (read_cache_ && input.good()) { + std::copy( + std::istream_iterator(input), 
std::istream_iterator(), + std::back_inserter(calib_cache_)); + } + + length = calib_cache_.size(); + if (length) { + std::cout << "Using cached calibration table to build the engine" << std::endl; + } else { + std::cout << "New calibration table will be created to build the engine" << std::endl; + } + return length ? &calib_cache_[0] : nullptr; + } + + void writeCalibrationCache(const void * cache, size_t length) noexcept override + { + std::ofstream output(calibration_cache_file_, std::ios::binary); + output.write(reinterpret_cast(cache), length); + } + + double getQuantile() const noexcept + { + printf("Quantile %f\n", quantile_); + return quantile_; + } + + double getRegressionCutoff(void) const noexcept + { + printf("Cutoff %f\n", cutoff_); + return cutoff_; + } + + const void * readHistogramCache(std::size_t & length) noexcept + { + hist_cache_.clear(); + std::ifstream input(histogram_cache_file_, std::ios::binary); + input >> std::noskipws; + if (read_cache_ && input.good()) { + std::copy( + std::istream_iterator(input), std::istream_iterator(), + std::back_inserter(hist_cache_)); + } + + length = hist_cache_.size(); + if (length) { + std::cout << "Using cached histogram table to build the engine" << std::endl; + } else { + std::cout << "New histogram table will be created to build the engine" << std::endl; + } + return length ? &hist_cache_[0] : nullptr; + } + void writeHistogramCache(void const * ptr, std::size_t length) noexcept + { + std::ofstream output(histogram_cache_file_, std::ios::binary); + output.write(reinterpret_cast(ptr), length); + } + +private: + ImageStream stream_; + const std::string calibration_cache_file_; + const std::string histogram_cache_file_; + bool read_cache_{true}; + size_t input_count_; + void * device_input_{nullptr}; + std::vector calib_cache_; + std::vector hist_cache_; + double scale_; + double quantile_; + double cutoff_; +}; + +/** + * @class Int8LegacyCalibrator + * @brief Calibrator for Percentile + * @warning This calibrator causes crucial accuracy drop for YOLOV10. 
+ */ +class Int8EntropyCalibrator : public nvinfer1::IInt8EntropyCalibrator2 +{ +public: + Int8EntropyCalibrator( + ImageStream & stream, const std::string calibration_cache_file, double scale = 1.0, + bool read_cache = true) + : stream_(stream), calibration_cache_file_(calibration_cache_file), read_cache_(read_cache) + { + auto d = stream_.getInputDims(); + input_count_ = stream_.getBatchSize() * d.d[1] * d.d[2] * d.d[3]; + CHECK_CUDA_ERROR(cudaMalloc(&device_input_, input_count_ * sizeof(float))); + scale_ = scale; + auto algType = getAlgorithm(); + switch (algType) { + case (nvinfer1::CalibrationAlgoType::kLEGACY_CALIBRATION): + std::cout << "CalibrationAlgoType : kLEGACY_CALIBRATION" << std::endl; + break; + case (nvinfer1::CalibrationAlgoType::kENTROPY_CALIBRATION): + std::cout << "CalibrationAlgoType : kENTROPY_CALIBRATION" << std::endl; + break; + case (nvinfer1::CalibrationAlgoType::kENTROPY_CALIBRATION_2): + std::cout << "CalibrationAlgoType : kENTROPY_CALIBRATION_2" << std::endl; + break; + case (nvinfer1::CalibrationAlgoType::kMINMAX_CALIBRATION): + std::cout << "CalibrationAlgoType : kMINMAX_CALIBRATION" << std::endl; + break; + default: + std::cout << "No CalibrationAlgType" << std::endl; + break; + } + } + int getBatchSize() const noexcept override { return stream_.getBatchSize(); } + + virtual ~Int8EntropyCalibrator() { CHECK_CUDA_ERROR(cudaFree(device_input_)); } + + bool getBatch(void * bindings[], const char * names[], int nb_bindings) noexcept override + { + (void)names; + (void)nb_bindings; + + if (!stream_.next(scale_)) { + return false; + } + try { + CHECK_CUDA_ERROR(cudaMemcpy( + device_input_, stream_.getBatch(), input_count_ * sizeof(float), cudaMemcpyHostToDevice)); + } catch (const std::exception & e) { + // Do nothing + } + bindings[0] = device_input_; + return true; + } + + const void * readCalibrationCache(size_t & length) noexcept override + { + calib_cache_.clear(); + std::ifstream input(calibration_cache_file_, std::ios::binary); + input >> std::noskipws; + if (read_cache_ && input.good()) { + std::copy( + std::istream_iterator(input), std::istream_iterator(), + std::back_inserter(calib_cache_)); + } + + length = calib_cache_.size(); + if (length) { + std::cout << "Using cached calibration table to build the engine" << std::endl; + } else { + std::cout << "New calibration table will be created to build the engine" << std::endl; + } + return length ? 
&calib_cache_[0] : nullptr; + } + + void writeCalibrationCache(const void * cache, size_t length) noexcept override + { + std::ofstream output(calibration_cache_file_, std::ios::binary); + output.write(reinterpret_cast(cache), length); + } + +private: + ImageStream stream_; + const std::string calibration_cache_file_; + bool read_cache_{true}; + size_t input_count_; + void * device_input_{nullptr}; + std::vector calib_cache_; + std::vector hist_cache_; + double scale_; +}; + +/** + * @class Int8MinMaxCalibrator + * @brief Calibrator for MinMax + * @warning We strongly recommend MinMax calibrator for YOLOV10 + */ +class Int8MinMaxCalibrator : public nvinfer1::IInt8MinMaxCalibrator +{ +public: + Int8MinMaxCalibrator( + ImageStream & stream, const std::string calibration_cache_file, double scale = 1.0, + bool read_cache = true) + : stream_(stream), calibration_cache_file_(calibration_cache_file), read_cache_(read_cache) + { + auto d = stream_.getInputDims(); + input_count_ = stream_.getBatchSize() * d.d[1] * d.d[2] * d.d[3]; + CHECK_CUDA_ERROR(cudaMalloc(&device_input_, input_count_ * sizeof(float))); + scale_ = scale; + auto algType = getAlgorithm(); + switch (algType) { + case (nvinfer1::CalibrationAlgoType::kLEGACY_CALIBRATION): + std::cout << "CalibrationAlgoType : kLEGACY_CALIBRATION" << std::endl; + break; + case (nvinfer1::CalibrationAlgoType::kENTROPY_CALIBRATION): + std::cout << "CalibrationAlgoType : kENTROPY_CALIBRATION" << std::endl; + break; + case (nvinfer1::CalibrationAlgoType::kENTROPY_CALIBRATION_2): + std::cout << "CalibrationAlgoType : kENTROPY_CALIBRATION_2" << std::endl; + break; + case (nvinfer1::CalibrationAlgoType::kMINMAX_CALIBRATION): + std::cout << "CalibrationAlgoType : kMINMAX_CALIBRATION" << std::endl; + break; + default: + std::cout << "No CalibrationAlgType" << std::endl; + break; + } + } + int getBatchSize() const noexcept override { return stream_.getBatchSize(); } + + virtual ~Int8MinMaxCalibrator() { CHECK_CUDA_ERROR(cudaFree(device_input_)); } + + bool getBatch(void * bindings[], const char * names[], int nb_bindings) noexcept override + { + (void)names; + (void)nb_bindings; + + if (!stream_.next(scale_)) { + return false; + } + try { + CHECK_CUDA_ERROR(cudaMemcpy( + device_input_, stream_.getBatch(), input_count_ * sizeof(float), cudaMemcpyHostToDevice)); + } catch (const std::exception & e) { + // Do nothing + } + bindings[0] = device_input_; + return true; + } + + const void * readCalibrationCache(size_t & length) noexcept override + { + calib_cache_.clear(); + std::ifstream input(calibration_cache_file_, std::ios::binary); + input >> std::noskipws; + if (read_cache_ && input.good()) { + std::copy( + std::istream_iterator(input), std::istream_iterator(), + std::back_inserter(calib_cache_)); + } + + length = calib_cache_.size(); + if (length) { + std::cout << "Using cached calibration table to build the engine" << std::endl; + } else { + std::cout << "New calibration table will be created to build the engine" << std::endl; + } + return length ? 
&calib_cache_[0] : nullptr; + } + + void writeCalibrationCache(const void * cache, size_t length) noexcept override + { + std::ofstream output(calibration_cache_file_, std::ios::binary); + output.write(reinterpret_cast(cache), length); + } + +private: + ImageStream stream_; + const std::string calibration_cache_file_; + bool read_cache_{true}; + size_t input_count_; + void * device_input_{nullptr}; + std::vector calib_cache_; + std::vector hist_cache_; + double scale_; +}; +} // namespace autoware::tensorrt_yolov10 + +#endif // AUTOWARE__TENSORRT_YOLOV10__CALIBRATOR_HPP_ diff --git a/perception/autoware_tensorrt_yolov10/include/autoware/tensorrt_yolov10/preprocess.hpp b/perception/autoware_tensorrt_yolov10/include/autoware/tensorrt_yolov10/preprocess.hpp new file mode 100644 index 0000000000000..e1e47edc9d996 --- /dev/null +++ b/perception/autoware_tensorrt_yolov10/include/autoware/tensorrt_yolov10/preprocess.hpp @@ -0,0 +1,202 @@ +// Copyright 2023 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__TENSORRT_YOLOV10__PREPROCESS_HPP_ +#define AUTOWARE__TENSORRT_YOLOV10__PREPROCESS_HPP_ + +#include +#include +#include +#include +#include + +namespace autoware +{ +namespace tensorrt_yolox +{ +struct Roi +{ + int x; + int y; + int w; + int h; +}; + +/** + * @brief Resize a image using bilinear interpolation on gpus + * @param[out] dst Resized image + * @param[in] src image + * @param[in] d_w width for resized image + * @param[in] d_h height for resized image + * @param[in] d_c channel for resized image + * @param[in] s_w width for input image + * @param[in] s_h height for input image + * @param[in] s_c channel for input image + * @param[in] stream cuda stream + */ +extern void resize_bilinear_gpu( + unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c, + cudaStream_t stream); + +/** + * @brief Letterbox a image on gpus + * @param[out] dst letterbox-ed image + * @param[in] src image + * @param[in] d_w width for letterbox-ing + * @param[in] d_h height for letterbox-ing + * @param[in] d_c channel for letterbox-ing + * @param[in] s_w width for input image + * @param[in] s_h height for input image + * @param[in] s_c channel for input image + * @param[in] stream cuda stream + */ +extern void letterbox_gpu( + unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c, + cudaStream_t stream); + +/** + * @brief NHWC to NHWC conversion + * @param[out] dst converted image + * @param[in] src image + * @param[in] d_w width for a image + * @param[in] d_h height for a image + * @param[in] d_c channel for a image + * @param[in] stream cuda stream + */ +extern void nchw_to_nhwc_gpu( + unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, cudaStream_t stream); + +/** + * @brief Unsigned char to float32 for inference + * @param[out] dst32 converted image + * @param[in] src image + * @param[in] d_w width for a image + * @param[in] d_h height for a image + * @param[in] d_c channel 
for a image + * @param[in] stream cuda stream + */ +extern void to_float_gpu( + float * dst32, unsigned char * src, int d_w, int d_h, int d_c, cudaStream_t stream); + +/** + * @brief Resize and letterbox a image using bilinear interpolation on gpus + * @param[out] dst processed image + * @param[in] src image + * @param[in] d_w width for output + * @param[in] d_h height for output + * @param[in] d_c channel for output + * @param[in] s_w width for input + * @param[in] s_h height for input + * @param[in] s_c channel for input + * @param[in] stream cuda stream + */ +extern void resize_bilinear_letterbox_gpu( + unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c, + cudaStream_t stream); + +/** + * @brief Optimized preprocessing including resize, letterbox, nhwc2nchw, toFloat and normalization + * for YOLOX on gpus + * @param[out] dst processed image + * @param[in] src image + * @param[in] d_w width for output + * @param[in] d_h height for output + * @param[in] d_c channel for output + * @param[in] s_w width for input + * @param[in] s_h height for input + * @param[in] s_c channel for input + * @param[in] norm normalization + * @param[in] stream cuda stream + */ +extern void resize_bilinear_letterbox_nhwc_to_nchw32_gpu( + float * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c, + float norm, cudaStream_t stream); + +/** + * @brief Optimized preprocessing including resize, letterbox, nhwc2nchw, toFloat and normalization + * with batching for YOLOX on gpus + * @param[out] dst processed image + * @param[in] src image + * @param[in] d_w width for output + * @param[in] d_h height for output + * @param[in] d_c channel for output + * @param[in] s_w width for input + * @param[in] s_h height for input + * @param[in] s_c channel for input + * @param[in] batch batch size + * @param[in] norm normalization + * @param[in] stream cuda stream + */ +extern void resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( + float * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c, int batch, + float norm, cudaStream_t stream); + +/** + * @brief Optimized preprocessing including crop, resize, letterbox, nhwc2nchw, toFloat and + * normalization with batching for YOLOX on gpus + * @param[out] dst processed image + * @param[in] src image + * @param[in] d_w width for output + * @param[in] d_h height for output + * @param[in] d_c channel for output + * @param[in] s_w width for input + * @param[in] s_h height for input + * @param[in] s_c channel for input + * @param[in] d_roi regions of interest for cropping + * @param[in] batch batch size + * @param[in] norm normalization + * @param[in] stream cuda stream + */ +extern void crop_resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( + float * dst, unsigned char * src, int d_w, int d_h, int d_c, Roi * d_roi, int s_w, int s_h, + int s_c, int batch, float norm, cudaStream_t stream); + +/** + * @brief Optimized multi-scale preprocessing including crop, resize, letterbox, nhwc2nchw, toFloat + * and normalization with batching for YOLOX on gpus + * @param[out] dst processed image + * @param[in] src image + * @param[in] d_w width for output + * @param[in] d_h height for output + * @param[in] d_c channel for output + * @param[in] s_w width for input + * @param[in] s_h height for input + * @param[in] s_c channel for input + * @param[in] d_roi regions of interest for cropping + * @param[in] batch batch size + * @param[in] norm normalization + * @param[in] stream cuda stream + */ +extern 
void multi_scale_resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( + float * dst, unsigned char * src, int d_w, int d_h, int d_c, Roi * d_roi, int s_w, int s_h, + int s_c, int batch, float norm, cudaStream_t stream); + +/** + * @brief Argmax on GPU + * @param[out] dst processed image + * @param[in] src probability map + * @param[in] d_w width for output + * @param[in] d_h height for output + * @param[in] s_w width for input + * @param[in] s_h height for input + * @param[in] s_c channel for input + * @param[in] batch batch size + * @param[in] stream cuda stream + */ +extern void argmax_gpu( + unsigned char * dst, float * src, int d_w, int d_h, int s_w, int s_h, int s_c, int batch, + cudaStream_t stream); +} // namespace tensorrt_yolox +} // namespace autoware +#endif // AUTOWARE__TENSORRT_YOLOV10__PREPROCESS_HPP_ diff --git a/perception/autoware_tensorrt_yolov10/include/autoware/tensorrt_yolov10/tensorrt_yolov10.hpp b/perception/autoware_tensorrt_yolov10/include/autoware/tensorrt_yolov10/tensorrt_yolov10.hpp new file mode 100644 index 0000000000000..6e6de8c6121ab --- /dev/null +++ b/perception/autoware_tensorrt_yolov10/include/autoware/tensorrt_yolov10/tensorrt_yolov10.hpp @@ -0,0 +1,143 @@ +// Copyright 2023 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef AUTOWARE__TENSORRT_YOLOV10__TENSORRT_YOLOV10_HPP_ +#define AUTOWARE__TENSORRT_YOLOV10__TENSORRT_YOLOV10_HPP_ + +#include +#include +#include +#include + +#include +#include +#include + +namespace autoware::tensorrt_yolov10 +{ +using cuda_utils::CudaUniquePtr; +using cuda_utils::CudaUniquePtrHost; +using cuda_utils::makeCudaStream; +using cuda_utils::StreamUniquePtr; + +struct Object +{ + int32_t x_offset; + int32_t y_offset; + int32_t height; + int32_t width; + float score; + int32_t type; +}; + +using ObjectArray = std::vector; +using ObjectArrays = std::vector; + +class TrtYolov10 +{ +public: + TrtYolov10( + const std::string & model_path, const std::string & precision, const int num_class = 8, + const float score_threshold = 0.8, + const tensorrt_common::BuildConfig build_config = tensorrt_common::BuildConfig(), + const bool use_gpu_preprocess = false, const uint8_t gpu_id = 0, + std::string calibration_image_list_file = std::string(), const double norm_factor = 1.0, + [[maybe_unused]] const std::string & cache_dir = "", + const tensorrt_common::BatchConfig & batch_config = {1, 1, 1}, + const size_t max_workspace_size = (1 << 30)); + + ~TrtYolov10(); + + bool setCudaDeviceId(const uint8_t gpu_id); + + /** + * @brief return a flag for gpu initialization + */ + bool isGPUInitialized() const { return is_gpu_initialized_; } + + /** + * @brief run inference including pre-process and post-process + * @param[out] objects results for object detection + * @param[in] images batched images + */ + bool doInference(const std::vector & images, ObjectArrays & objects); + + // /** + // * @brief allocate buffer for preprocess on GPU + // * @param[in] width original image width + // * @param[in] height original image height + // * @warning if we don't allocate buffers using it, "preprocessGpu" allocates buffers at the + // * beginning + // */ + // void initPreprocessBuffer(int width, int height); + + // /** + // * @brief output TensorRT profiles for each layer + // */ + // void printProfiling(void); + + void preProcess(cv::Mat * img, int length, float * factor, std::vector & data); + + void preprocess(const std::vector & images, std::vector & data); + ObjectArray postprocess(float * result, float factor); + + // /** + // * @brief run preprocess on GPU + // * @param[in] images batching images + // */ + // void preprocessGpu(const std::vector & images); + + bool feedforward(const std::vector & images, ObjectArrays & objects); + + std::unique_ptr trt_common_; + + std::vector input_h_; + CudaUniquePtr input_d_; + + size_t out_elem_num_; + size_t out_elem_num_per_batch_; + CudaUniquePtr out_d_; + CudaUniquePtrHost out_h_; + StreamUniquePtr stream_; + + int32_t max_detections_; + std::vector scales_; + + int num_class_; + float score_threshold_; + int batch_size_; + + // GPU id for inference + const uint8_t gpu_id_; + // flag for gpu initialization + bool is_gpu_initialized_; + // host buffer for preprocessing on GPU + CudaUniquePtrHost image_buf_h_; + // device buffer for preprocessing on GPU + CudaUniquePtr image_buf_d_; + + int src_width_; + int src_height_; + + CudaUniquePtr argmax_buf_d_; + double norm_factor_; + int multitask_; + bool use_gpu_preprocess_; + + float factor_ = 1.0; +}; + +} // namespace autoware::tensorrt_yolov10 + +#endif // AUTOWARE__TENSORRT_YOLOV10__TENSORRT_YOLOV10_HPP_ diff --git a/perception/autoware_tensorrt_yolov10/include/autoware/tensorrt_yolov10/tensorrt_yolov10_node.hpp 
b/perception/autoware_tensorrt_yolov10/include/autoware/tensorrt_yolov10/tensorrt_yolov10_node.hpp new file mode 100644 index 0000000000000..f869ee117db25 --- /dev/null +++ b/perception/autoware_tensorrt_yolov10/include/autoware/tensorrt_yolov10/tensorrt_yolov10_node.hpp @@ -0,0 +1,76 @@ +// Copyright 2022 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__TENSORRT_YOLOV10__TENSORRT_YOLOV10_NODE_HPP_ +#define AUTOWARE__TENSORRT_YOLOV10__TENSORRT_YOLOV10_NODE_HPP_ + +#include "autoware/object_recognition_utils/object_recognition_utils.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#if __has_include() +#include +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#include + +namespace autoware::tensorrt_yolov10 +{ +// cspell: ignore Semseg +class TrtYolov10Node : public rclcpp::Node +{ +public: + explicit TrtYolov10Node(const rclcpp::NodeOptions & node_options); + +private: + void onConnect(); + void onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg); + + bool readLabelFile(const std::string & label_path); + void replaceLabelMap(); + + std::unique_ptr trt_yolov10_; + + image_transport::Subscriber image_sub_; + + rclcpp::TimerBase::SharedPtr timer_; + std::unique_ptr> stop_watch_ptr_; + std::unique_ptr debug_publisher_; + rclcpp::Publisher::SharedPtr objects_pub_; + image_transport::Publisher image_pub_; + + std::map label_map_; +}; + +} // namespace autoware::tensorrt_yolov10 + +#endif // AUTOWARE__TENSORRT_YOLOV10__TENSORRT_YOLOV10_NODE_HPP_ diff --git a/perception/autoware_tensorrt_yolov10/launch/yolov10.launch.xml b/perception/autoware_tensorrt_yolov10/launch/yolov10.launch.xml new file mode 100644 index 0000000000000..dd1b020847437 --- /dev/null +++ b/perception/autoware_tensorrt_yolov10/launch/yolov10.launch.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/perception/autoware_tensorrt_yolov10/launch/yolov10_single_image.launch.xml b/perception/autoware_tensorrt_yolov10/launch/yolov10_single_image.launch.xml new file mode 100644 index 0000000000000..ab8a0f8c645e6 --- /dev/null +++ b/perception/autoware_tensorrt_yolov10/launch/yolov10_single_image.launch.xml @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/perception/autoware_tensorrt_yolov10/package.xml b/perception/autoware_tensorrt_yolov10/package.xml new file mode 100644 index 0000000000000..76b161e50cbd6 --- /dev/null +++ b/perception/autoware_tensorrt_yolov10/package.xml @@ -0,0 +1,41 @@ + + + autoware_tensorrt_yolov10 + 0.0.1 + tensorrt library implementation for yolov10 + + Su Chang + Su Chang + + Apache License 2.0 + + ament_cmake_auto + autoware_cmake + cudnn_cmake_module + tensorrt_cmake_module + + cudnn_cmake_module + tensorrt_cmake_module + + autoware_object_recognition_utils + autoware_perception_msgs + autoware_tensorrt_common + cuda_utils + cv_bridge + image_transport + libopencv-dev + perception_utils + rclcpp + rclcpp_components + sensor_msgs + 
tier4_perception_msgs + + autoware_image_transport_decompressor + + ament_lint_auto + autoware_lint_common + + + ament_cmake + + diff --git a/perception/autoware_tensorrt_yolov10/src/tensorrt_yolov10.cpp b/perception/autoware_tensorrt_yolov10/src/tensorrt_yolov10.cpp new file mode 100644 index 0000000000000..5106edc9cc6e5 --- /dev/null +++ b/perception/autoware_tensorrt_yolov10/src/tensorrt_yolov10.cpp @@ -0,0 +1,397 @@ +// Copyright 2023 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cuda_utils/cuda_check_error.hpp" +#include "cuda_utils/cuda_unique_ptr.hpp" + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PRINT_DEBUG_INFO printf("line:%d\n", __LINE__); + +namespace +{ +static void trimLeft(std::string & s) +{ + s.erase(s.begin(), find_if(s.begin(), s.end(), [](int ch) { return !isspace(ch); })); +} + +static void trimRight(std::string & s) +{ + s.erase(find_if(s.rbegin(), s.rend(), [](int ch) { return !isspace(ch); }).base(), s.end()); +} + +std::string trim(std::string & s) +{ + trimLeft(s); + trimRight(s); + return s; +} + +bool fileExists(const std::string & file_name, bool verbose) +{ + if (!std::experimental::filesystem::exists(std::experimental::filesystem::path(file_name))) { + if (verbose) { + std::cout << "File does not exist : " << file_name << std::endl; + } + return false; + } + return true; +} + +std::vector loadListFromTextFile(const std::string & filename) +{ + assert(fileExists(filename, true)); + std::vector list; + + std::ifstream f(filename); + if (!f) { + std::cout << "failed to open " << filename << std::endl; + assert(0); + } + + std::string line; + while (std::getline(f, line)) { + if (line.empty()) { + continue; + } else { + list.push_back(trim(line)); + } + } + + return list; +} + +std::vector loadImageList(const std::string & filename, const std::string & prefix) +{ + std::vector fileList = loadListFromTextFile(filename); + for (auto & file : fileList) { + if (fileExists(file, false)) { + continue; + } else { + std::string prefixed = prefix + file; + if (fileExists(prefixed, false)) + file = prefixed; + else + std::cerr << "WARNING: couldn't find: " << prefixed << " while loading: " << filename + << std::endl; + } + } + return fileList; +} + +} // anonymous namespace + +namespace autoware::tensorrt_yolov10 +{ +TrtYolov10::TrtYolov10( + const std::string & model_path, const std::string & precision, const int num_class, + const float score_threshold, const tensorrt_common::BuildConfig build_config, + const bool use_gpu_preprocess, const uint8_t gpu_id, std::string calibration_image_list_path, + const double norm_factor, [[maybe_unused]] const std::string & cache_dir, + const tensorrt_common::BatchConfig & batch_config, const size_t max_workspace_size) +: gpu_id_(gpu_id), is_gpu_initialized_(false) +{ + if (!setCudaDeviceId(gpu_id_)) { + return; + } + is_gpu_initialized_ = true; + + num_class_ = num_class; + 
score_threshold_ = score_threshold; + + src_width_ = -1; + src_height_ = -1; + norm_factor_ = norm_factor; + batch_size_ = batch_config[2]; + multitask_ = 0; + stream_ = makeCudaStream(); + + if (precision == "int8") { + if (build_config.clip_value <= 0.0) { + if (calibration_image_list_path.empty()) { + throw std::runtime_error( + "calibration_image_list_path should be passed to generate int8 engine " + "or specify values larger than zero to clip_value."); + } + } else { + // if clip value is larger than zero, calibration file is not needed + calibration_image_list_path = ""; + } + + int max_batch_size = batch_size_; + nvinfer1::Dims input_dims = tensorrt_common::get_input_dims(model_path); + std::vector calibration_images; + if (calibration_image_list_path != "") { + calibration_images = loadImageList(calibration_image_list_path, ""); + } + tensorrt_yolov10::ImageStream stream(max_batch_size, input_dims, calibration_images); + fs::path calibration_table{model_path}; + std::string ext = ""; + if (build_config.calib_type_str == "Entropy") { + ext = "EntropyV2-"; + } else if ( + build_config.calib_type_str == "Legacy" || build_config.calib_type_str == "Percentile") { + ext = "Legacy-"; + } else { + ext = "MinMax-"; + } + + ext += "calibration.table"; + calibration_table.replace_extension(ext); + fs::path histogram_table{model_path}; + ext = "histogram.table"; + histogram_table.replace_extension(ext); + + std::unique_ptr calibrator; + if (build_config.calib_type_str == "Entropy") { + calibrator.reset( + new tensorrt_yolov10::Int8EntropyCalibrator(stream, calibration_table, norm_factor_)); + + } else if ( + build_config.calib_type_str == "Legacy" || build_config.calib_type_str == "Percentile") { + const double quantile = 0.999999; + const double cutoff = 0.999999; + calibrator.reset(new tensorrt_yolov10::Int8LegacyCalibrator( + stream, calibration_table, histogram_table, norm_factor_, true, quantile, cutoff)); + } else { + calibrator.reset( + new tensorrt_yolov10::Int8MinMaxCalibrator(stream, calibration_table, norm_factor_)); + } + trt_common_ = std::make_unique( + model_path, precision, std::move(calibrator), batch_config, max_workspace_size, build_config); + } else { + trt_common_ = std::make_unique( + model_path, precision, nullptr, batch_config, max_workspace_size, build_config); + } + trt_common_->setup(); + + if (!trt_common_->isInitialized()) { + return; + } + num_class_ = num_class; + score_threshold_ = score_threshold; + + // GPU memory allocation + const auto input_dims = trt_common_->getBindingDimensions(0); + const auto input_size = + std::accumulate(input_dims.d + 1, input_dims.d + input_dims.nbDims, 1, std::multiplies()); + + const auto output_dims = trt_common_->getBindingDimensions(1); + input_d_ = cuda_utils::make_unique(batch_config[2] * input_size); + max_detections_ = output_dims.d[1]; + out_elem_num_ = std::accumulate( + output_dims.d + 1, output_dims.d + output_dims.nbDims, 1, std::multiplies()); + out_elem_num_ = out_elem_num_ * batch_config[2]; + out_elem_num_per_batch_ = static_cast(out_elem_num_ / batch_config[2]); + out_d_ = cuda_utils::make_unique(out_elem_num_); + out_h_ = cuda_utils::make_unique_host(out_elem_num_, cudaHostAllocPortable); + + if (use_gpu_preprocess) { + use_gpu_preprocess_ = true; + image_buf_h_ = nullptr; + image_buf_d_ = nullptr; + } else { + use_gpu_preprocess_ = false; + } +} + +TrtYolov10::~TrtYolov10() +{ + if (use_gpu_preprocess_) { + if (image_buf_h_) { + image_buf_h_.reset(); + } + if (image_buf_d_) { + image_buf_d_.reset(); + } + if 
(argmax_buf_d_) {
+      argmax_buf_d_.reset();
+    }
+  }
+}
+
+bool TrtYolov10::setCudaDeviceId(const uint8_t gpu_id)
+{
+  cudaError_t cuda_err = cudaSetDevice(gpu_id);
+  if (cuda_err != cudaSuccess) {
+    return false;
+  } else {
+    return true;
+  }
+}
+
+void TrtYolov10::preProcess(cv::Mat * img, int length, float * factor, std::vector<float> & data)
+{
+  // Create a new cv::Mat object for storing the image after conversion
+  cv::Mat mat;
+
+  // Get the dimensions and number of channels of the input image
+  int rh = img->rows;  // Height of the input image
+  int rw = img->cols;  // Width of the input image
+
+  // Convert the input image from BGR to RGB color space
+  cv::cvtColor(*img, mat, cv::COLOR_BGR2RGB);
+
+  // Determine the size of the new square image (largest dimension of the input image)
+  int maxImageLength = rw > rh ? rw : rh;
+
+  cv::Mat maxImage = cv::Mat::zeros(maxImageLength, maxImageLength, CV_8UC3);
+
+  // Note: maxImage is zero-initialized, so this multiplication is a no-op and the
+  // padding area stays black
+  maxImage = maxImage * 255;
+
+  // Define a Region of Interest (ROI) that covers the entire original image
+  cv::Rect roi(0, 0, rw, rh);
+
+  // Copy the original image into the ROI of the new square image
+  mat.copyTo(cv::Mat(maxImage, roi));
+
+  // Create a new cv::Mat object for storing the resized image
+  cv::Mat resizeImg;
+
+  // Resize the square image to the specified dimensions (length x length)
+  cv::resize(maxImage, resizeImg, cv::Size(length, length), 0.0f, 0.0f, cv::INTER_LINEAR);
+
+  // Calculate the scaling factor and store it in the 'factor' variable
+  *factor = (1.0 * maxImageLength) / (1.0 * length);
+
+  // Convert the resized image to floating-point format with values in range [0, 1]
+  resizeImg.convertTo(resizeImg, CV_32FC3, 1 / 255.0);
+
+  // Update the height, width, and number of channels for the resized image
+  rh = resizeImg.rows;
+  rw = resizeImg.cols;
+  int rc = resizeImg.channels();
+
+  // Extract each channel of the resized image and store it in the 'data' vector
+  for (int i = 0; i < rc; ++i) {
+    // Extract the i-th channel and store it in the appropriate part of the 'data' vector
+    cv::extractChannel(resizeImg, cv::Mat(rh, rw, CV_32FC1, data.data() + i * rh * rw), i);
+  }
+}
+
+void TrtYolov10::preprocess(const std::vector<cv::Mat> & images, std::vector<float> & inputData)
+{
+  // TODO: support multiple images; for now, assume inference on a single image only.
+  for (cv::Mat image : images) {
+    int length = 640;  // model input size
+    factor_ = 1.0;
+
+    preProcess(&image, length, &factor_, inputData);
+
+    // // check sum
+    // float sum = std::accumulate(inputData.begin(), inputData.end(), 0.0f);
+    // printf("sum=%.2f\n", sum);
+  }
+}
+
+bool TrtYolov10::doInference(const std::vector<cv::Mat> & images, ObjectArrays & objects)
+{
+  if (!setCudaDeviceId(gpu_id_)) {
+    return false;
+  }
+
+  if (!trt_common_->isInitialized()) {
+    return false;
+  }
+
+  std::vector<float> input_data(640 * 640 * 3);
+  preprocess(images, input_data);
+  CHECK_CUDA_ERROR(cudaMemcpy(
+    input_d_.get(), input_data.data(), 3 * 640 * 640 * sizeof(float), cudaMemcpyHostToDevice));
+
+  return feedforward(images, objects);
+}
+
+bool TrtYolov10::feedforward(const std::vector<cv::Mat> & images, ObjectArrays & objects)
+{
+  // PRINT_DEBUG_INFO
+  std::vector<void *> buffers = {input_d_.get(), out_d_.get()};
+
+  trt_common_->enqueueV2(buffers.data(), *stream_, nullptr);
+
+  // printf("out_elem_num_:%d\n", (int)out_elem_num_);
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_h_.get(), out_d_.get(), sizeof(float) * out_elem_num_, cudaMemcpyDeviceToHost, *stream_));
+
+  cudaStreamSynchronize(*stream_);
+  objects.clear();
+
+  const auto batch_size = images.size();
+  for (size_t i = 0; i < batch_size; ++i) {
+    // auto image_size = images[i].size();
+    float * batch_prob = out_h_.get() + (i * out_elem_num_per_batch_);
+
+    ObjectArray object_array = postprocess(batch_prob, factor_);
+    objects.emplace_back(object_array);
+  }
+
+  return true;
+}
+
+ObjectArray TrtYolov10::postprocess(float * result, float factor)
+{
+  ObjectArray object_array;
+
+  // yolov10m output: 1 x 300 x 6
+  for (int i = 0; i < max_detections_; i++) {
+    int s = 6 * i;
+
+    float score = result[s + 4];
+    // printf("i:%d,score:%.3f\n", i, (float)result[s + 4]);
+
+    if (score > score_threshold_) {
+      float ltx = result[s + 0];
+      float lty = result[s + 1];
+      float rbx = result[s + 2];
+      float rby = result[s + 3];
+
+      int x = static_cast<int>(ltx * factor);
+      int y = static_cast<int>(lty * factor);
+      int width = static_cast<int>((rbx - ltx) * factor);
+      int height = static_cast<int>((rby - lty) * factor);
+
+      Object object;
+      object.x_offset = x;
+      object.y_offset = y;
+      object.width = width;
+      object.height = height;
+      object.score = score;
+      object.type = static_cast<int>(result[s + 5]);
+      object_array.emplace_back(object);
+    }
+  }
+
+  return object_array;
+}
+
+} // namespace autoware::tensorrt_yolov10
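
The coordinate mapping in `postprocess` undoes the scaling applied in `preProcess` with a single factor. A small standalone sketch with illustrative numbers (not part of this package) to make the arithmetic concrete:

```cpp
#include <algorithm>
#include <cstdio>

// Illustrative round trip of the scale factor used above, assuming a
// 1920x1080 input and the fixed 640x640 network resolution.
int main()
{
  const int src_w = 1920, src_h = 1080;
  const int length = 640;  // network input size

  // preProcess pads the image to a square of the longest side, then resizes:
  const int max_side = std::max(src_w, src_h);                  // 1920
  const float factor = static_cast<float>(max_side) / length;   // 3.0

  // A box predicted at (100, 50)-(200, 150) in 640x640 network coordinates
  // maps back to source coordinates by multiplying with the factor:
  const float ltx = 100.f, lty = 50.f, rbx = 200.f, rby = 150.f;
  std::printf("x=%d y=%d w=%d h=%d\n",
    static_cast<int>(ltx * factor),            // 300
    static_cast<int>(lty * factor),            // 150
    static_cast<int>((rbx - ltx) * factor),    // 300
    static_cast<int>((rby - lty) * factor));   // 300
  return 0;
}
```

In other words, a 1920x1080 source is padded to a 1920x1920 square and resized to 640x640, so `factor` is 3.0 and every network-space coordinate is simply multiplied by 3 on the way back.

diff --git a/perception/autoware_tensorrt_yolov10/src/tensorrt_yolov10_node.cpp b/perception/autoware_tensorrt_yolov10/src/tensorrt_yolov10_node.cpp
new file mode 100644
index 0000000000000..6d173876b9840
--- /dev/null
+++ b/perception/autoware_tensorrt_yolov10/src/tensorrt_yolov10_node.cpp
@@ -0,0 +1,197 @@
+// Copyright 2022 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.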
+ +#include "autoware/tensorrt_yolov10/tensorrt_yolov10_node.hpp" + +#include "autoware/object_recognition_utils/object_classification.hpp" +#include "perception_utils/run_length_encoder.hpp" + +#include + +#include +#include +#include +#include +#include + +#define PRINT_DEBUG_INFO printf("line:%d\n", __LINE__); + +namespace autoware::tensorrt_yolov10 +{ +TrtYolov10Node::TrtYolov10Node(const rclcpp::NodeOptions & node_options) +: Node("tensorrt_yolov10", node_options) +{ + { + stop_watch_ptr_ = + std::make_unique>(); + debug_publisher_ = + std::make_unique(this, this->get_name()); + stop_watch_ptr_->tic("cyclic_time"); + stop_watch_ptr_->tic("processing_time"); + } + using std::placeholders::_1; + using std::chrono_literals::operator""ms; + + const std::string model_path = this->declare_parameter("model_path"); + const std::string precision = this->declare_parameter("precision", "fp16"); + const uint8_t gpu_id = this->declare_parameter("gpu_id", 0); + const std::string label_path = this->declare_parameter("label_path"); + + trt_yolov10_ = std::make_unique(model_path, precision); + + if (!readLabelFile(label_path)) { + RCLCPP_ERROR(this->get_logger(), "Could not find label file"); + rclcpp::shutdown(); + } + replaceLabelMap(); + + if (!trt_yolov10_->isGPUInitialized()) { + RCLCPP_ERROR(this->get_logger(), "GPU %d does not exist or is not suitable.", gpu_id); + rclcpp::shutdown(); + return; + } + RCLCPP_INFO(this->get_logger(), "GPU %d is selected for the inference!", gpu_id); + + timer_ = + rclcpp::create_timer(this, get_clock(), 100ms, std::bind(&TrtYolov10Node::onConnect, this)); + + objects_pub_ = this->create_publisher( + "~/out/objects", 1); + image_pub_ = image_transport::create_publisher(this, "~/out/image"); +} + +void TrtYolov10Node::onConnect() +{ + using std::placeholders::_1; + if ( + objects_pub_->get_subscription_count() == 0 && + objects_pub_->get_intra_process_subscription_count() == 0 && + image_pub_.getNumSubscribers() == 0) { + // printf("no subscribers who sub from objects_pub_ or image_pub_,shut down image_sub_\n"); + image_sub_.shutdown(); + } else if (!image_sub_) { + image_sub_ = image_transport::create_subscription( + this, "~/in/image", std::bind(&TrtYolov10Node::onImage, this, _1), "raw", + rmw_qos_profile_sensor_data); + } +} + +void TrtYolov10Node::onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg) +{ + stop_watch_ptr_->toc("processing_time", true); + tier4_perception_msgs::msg::DetectedObjectsWithFeature out_objects; + + cv_bridge::CvImagePtr in_image_ptr; + try { + in_image_ptr = cv_bridge::toCvCopy(msg, sensor_msgs::image_encodings::BGR8); + } catch (cv_bridge::Exception & e) { + RCLCPP_ERROR(this->get_logger(), "cv_bridge exception: %s", e.what()); + return; + } + const auto width = in_image_ptr->image.cols; + const auto height = in_image_ptr->image.rows; + + tensorrt_yolov10::ObjectArrays objects; + + if (!trt_yolov10_->doInference({in_image_ptr->image}, objects)) { + RCLCPP_WARN(this->get_logger(), "Fail to inference"); + return; + } + + for (const auto & yolov10_object : objects.at(0)) { + tier4_perception_msgs::msg::DetectedObjectWithFeature object; + object.feature.roi.x_offset = yolov10_object.x_offset; + object.feature.roi.y_offset = yolov10_object.y_offset; + object.feature.roi.width = yolov10_object.width; + object.feature.roi.height = yolov10_object.height; + object.object.existence_probability = yolov10_object.score; + object.object.classification = + object_recognition_utils::toObjectClassifications(label_map_[yolov10_object.type], 
1.0f); + out_objects.feature_objects.push_back(object); + const auto left = std::max(0, static_cast(object.feature.roi.x_offset)); + const auto top = std::max(0, static_cast(object.feature.roi.y_offset)); + const auto right = + std::min(static_cast(object.feature.roi.x_offset + object.feature.roi.width), width); + const auto bottom = + std::min(static_cast(object.feature.roi.y_offset + object.feature.roi.height), height); + cv::rectangle( + in_image_ptr->image, cv::Point(left, top), cv::Point(right, bottom), cv::Scalar(0, 0, 255), 3, + 8, 0); + + cv::putText( + in_image_ptr->image, label_map_[yolov10_object.type], cv::Point(left, top), + cv::FONT_HERSHEY_COMPLEX, 1, cv::Scalar(0, 255, 255), 1, 8, 0); + } + + image_pub_.publish(in_image_ptr->toImageMsg()); + out_objects.header = msg->header; + objects_pub_->publish(out_objects); + + if (debug_publisher_) { + const double processing_time_ms = stop_watch_ptr_->toc("processing_time", true); + const double cyclic_time_ms = stop_watch_ptr_->toc("cyclic_time", true); + const double pipeline_latency_ms = + std::chrono::duration( + std::chrono::nanoseconds( + (this->get_clock()->now() - out_objects.header.stamp).nanoseconds())) + .count(); + debug_publisher_->publish( + "debug/cyclic_time_ms", cyclic_time_ms); + debug_publisher_->publish( + "debug/processing_time_ms", processing_time_ms); + debug_publisher_->publish( + "debug/pipeline_latency_ms", pipeline_latency_ms); + } +} + +bool TrtYolov10Node::readLabelFile(const std::string & label_path) +{ + std::ifstream label_file(label_path); + if (!label_file.is_open()) { + RCLCPP_ERROR(this->get_logger(), "Could not open label file. [%s]", label_path.c_str()); + return false; + } + int label_index{}; + std::string label; + while (getline(label_file, label)) { + std::transform( + label.begin(), label.end(), label.begin(), [](auto c) { return std::toupper(c); }); + label_map_.insert({label_index, label}); + ++label_index; + } + return true; +} + +// we need this because autoware::object_recognition_utils::toLabel(const std::string & class_name) +// limit label type +void TrtYolov10Node::replaceLabelMap() +{ + for (std::size_t i = 0; i < label_map_.size(); ++i) { + auto & label = label_map_[i]; + if (label == "PERSON") { + label = "PEDESTRIAN"; + } else if (label == "MOTORBIKE") { + label = "MOTORCYCLE"; + } else if ( + label != "CAR" && label != "PEDESTRIAN" && label != "BUS" && label != "TRUCK" && + label != "BICYCLE" && label != "MOTORCYCLE") { + label = "UNKNOWN"; + } + } +} + +} // namespace autoware::tensorrt_yolov10 + +#include "rclcpp_components/register_node_macro.hpp" +RCLCPP_COMPONENTS_REGISTER_NODE(autoware::tensorrt_yolov10::TrtYolov10Node) diff --git a/perception/autoware_tensorrt_yolov10/src/yolov10_single_image_inference_node.cpp b/perception/autoware_tensorrt_yolov10/src/yolov10_single_image_inference_node.cpp new file mode 100644 index 0000000000000..da896629bec94 --- /dev/null +++ b/perception/autoware_tensorrt_yolov10/src/yolov10_single_image_inference_node.cpp @@ -0,0 +1,84 @@ +// Copyright 2022 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#if (defined(_MSC_VER) or (defined(__GNUC__) and (7 <= __GNUC_MAJOR__))) +#include +namespace fs = ::std::filesystem; +#else +#include +namespace fs = ::std::experimental::filesystem; +#endif + +#include +#include + +// using namespace std; + +#define PRINT_DEBUG_INFO printf("line:%d\n", __LINE__); + +namespace autoware::tensorrt_yolov10 +{ +class Yolov10SingleImageInferenceNode : public rclcpp::Node +{ +public: + explicit Yolov10SingleImageInferenceNode(const rclcpp::NodeOptions & node_options) + : Node("yolov10_single_image_inference", node_options) + { + const auto image_path = declare_parameter("image_path", ""); + const auto model_path = declare_parameter("model_path", ""); + const auto precision = declare_parameter("precision", "fp16"); + const auto save_image = declare_parameter("save_image", true); + + printf("image_path:%s\n", image_path.c_str()); + auto p = fs::path(image_path); + const auto ext = p.extension().string(); + p.replace_extension(""); + const auto output_image_path = + declare_parameter("output_image_path", p.string() + "_detect" + ext); + + printf("model_path:%s,output_image_path:%s\n", model_path.c_str(), output_image_path.c_str()); + auto trt_yolov10 = std::make_unique(model_path, precision); + auto image = cv::imread(image_path); + tensorrt_yolov10::ObjectArrays objects; + + trt_yolov10->doInference({image}, objects); + for (const auto & object : objects[0]) { + const auto left = object.x_offset; + const auto top = object.y_offset; + const auto right = std::clamp(left + object.width, 0, image.cols); + const auto bottom = std::clamp(top + object.height, 0, image.rows); + cv::rectangle( + image, cv::Point(left, top), cv::Point(right, bottom), cv::Scalar(0, 0, 255), 3, 8, 0); + + std::string cls_id = std::to_string(object.type); + cv::putText( + image, cls_id, cv::Point(left, top), cv::FONT_HERSHEY_COMPLEX, 1, cv::Scalar(0, 255, 255), + 1, 8, 0); + } + if (!save_image) { + cv::imshow("inference image", image); + cv::waitKey(0); + rclcpp::shutdown(); + } + cv::imwrite(output_image_path, image); + rclcpp::shutdown(); + } +}; +} // namespace autoware::tensorrt_yolov10 + +#include "rclcpp_components/register_node_macro.hpp" +RCLCPP_COMPONENTS_REGISTER_NODE(autoware::tensorrt_yolov10::Yolov10SingleImageInferenceNode)
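
To close, a minimal offline usage sketch of the `TrtYolov10` API added by this PR, mirroring what `Yolov10SingleImageInferenceNode` does (paths are placeholders; error handling is omitted; assumes the package is built with TensorRT available):

```cpp
#include <autoware/tensorrt_yolov10/tensorrt_yolov10.hpp>

#include <opencv2/opencv.hpp>

#include <iostream>
#include <memory>

int main()
{
  using autoware::tensorrt_yolov10::ObjectArrays;
  using autoware::tensorrt_yolov10::TrtYolov10;

  // Placeholder paths; a TensorRT engine is built from the ONNX model on first run.
  auto yolov10 = std::make_unique<TrtYolov10>("/path/to/yolov10m.onnx", "fp16");
  cv::Mat image = cv::imread("/path/to/image.jpg");

  ObjectArrays objects;
  if (!yolov10->doInference({image}, objects)) {
    std::cerr << "inference failed" << std::endl;
    return 1;
  }
  // One ObjectArray per input image; each Object carries a box, score, and class ID.
  for (const auto & obj : objects[0]) {
    std::cout << "class " << obj.type << " score " << obj.score << " box [" << obj.x_offset
              << ", " << obj.y_offset << ", " << obj.width << ", " << obj.height << "]"
              << std::endl;
  }
  return 0;
}
```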