Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

C++ Demo - Object Tracking (VitTrack) #240

Merged
merged 8 commits into from
Mar 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions models/object_tracking_vittrack/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
cmake_minimum_required(VERSION 3.24)
set(project_name "opencv_zoo_object_tracking_vittrack")

PROJECT (${project_name})

set(OPENCV_VERSION "4.9.0")
set(OPENCV_INSTALLATION_PATH "" CACHE PATH "Where to look for OpenCV installation")
find_package(OpenCV ${OPENCV_VERSION} REQUIRED HINTS ${OPENCV_INSTALLATION_PATH})
# Find OpenCV, you may need to set OpenCV_DIR variable
# to the absolute path to the directory containing OpenCVConfig.cmake file
# via the command line or GUI

file(GLOB SourceFile
    "demo.cpp")
# If the package has been found, several variables will
# be set, you can find the full list with descriptions
# in the OpenCVConfig.cmake file.
# Print some message showing some of them
message(STATUS "OpenCV library status:")
message(STATUS "    config: ${OpenCV_DIR}")
message(STATUS "    version: ${OpenCV_VERSION}")
message(STATUS "    libraries: ${OpenCV_LIBS}")
message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")

# Set C++ compilation standard to C++11.
# NOTE: CMAKE_CXX_STANDARD only initializes the CXX_STANDARD property of
# targets created AFTER it is set, so it must come before add_executable().
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Declare the executable target built from your sources
add_executable(${project_name} ${SourceFile})

# Link your application with OpenCV libraries
target_link_libraries(${project_name} PRIVATE ${OpenCV_LIBS})
22 changes: 21 additions & 1 deletion models/object_tracking_vittrack/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,34 @@ This model is contributed by [Pengyu Liu](https://github.com/lpylpy0514) in GSoC
**NOTE: OpenCV > 4.8.0 is required. Build from source with instructions from https://opencv.org/get-started/.**

# Demo

## Python
```bash
# tracking on camera input
python demo.py

# tracking on video
python demo.py --input /path/to/video

# get help regarding various parameters
python demo.py --help
```
## C++
Install the latest OpenCV and CMake >= 3.24.0 to get started.

```shell
# A typical and default installation path of OpenCV is /usr/local
cmake -B build -D OPENCV_INSTALLATION_PATH=/path/to/opencv/installation .
cmake --build build

# tracking on camera input
./build/opencv_zoo_object_tracking_vittrack

# tracking on video
./build/opencv_zoo_object_tracking_vittrack -i=/path/to/video

# get help messages
./build/opencv_zoo_object_tracking_vittrack -h
```

# Example outputs

Expand Down
210 changes: 210 additions & 0 deletions models/object_tracking_vittrack/demo.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
#include <iostream>
#include <opencv2/opencv.hpp>

using namespace std;
using namespace cv;
using namespace dnn;

// Result of a single tracking step on one frame.
struct TrackingResult
{
    bool isLocated;  // true when the tracker reports the target was found in this frame
    Rect bbox;       // bounding box written by the tracker update (meaningful when isLocated)
    float score;     // tracking confidence score returned by the tracker for this frame
};

class VitTrack
{
public:

VitTrack(const string& model_path, int backend_id = 0, int target_id = 0)
{
params.net = model_path;
params.backend = backend_id;
params.target = target_id;
model = TrackerVit::create(params);
}

void init(const Mat& image, const Rect& roi)
{
model->init(image, roi);
}

TrackingResult infer(const Mat& image)
{
TrackingResult result;
result.isLocated = model->update(image, result.bbox);
result.score = model->getTrackingScore();
return result;
}

private:
TrackerVit::Params params;
Ptr<TrackerVit> model;
};

// Draws the tracking result onto a copy of the input frame.
//
// Shows the FPS counter (when fps >= 0), and either the tracked bounding box
// with its confidence score, or a centered "Target lost!" message when the
// target is not located or its score falls below score_threshold.
//
// Parameters:
//   image           - frame to annotate (not modified; a clone is returned)
//   bbox            - tracked bounding box from the tracker
//   score           - tracker confidence for this frame
//   isLocated       - whether the tracker reported the target as found
//   fps             - frames per second to display; negative hides the counter
//   box_color       - color of the bounding box
//   text_color      - color of FPS and score text
//   fontScale       - text scale passed to putText/getTextSize
//   fontSize        - text thickness passed to putText/getTextSize
//   score_threshold - minimum score to treat the target as located
//                     (previously hard-coded as 0.3)
// Returns: annotated copy of the input frame.
Mat visualize(const Mat& image, const Rect& bbox, float score, bool isLocated, double fps = -1.0,
              const Scalar& box_color = Scalar(0, 255, 0), const Scalar& text_color = Scalar(0, 255, 0),
              double fontScale = 1.0, int fontSize = 1, float score_threshold = 0.3f)
{
    Mat output = image.clone();
    int h = output.rows;
    int w = output.cols;

    if (fps >= 0)
    {
        putText(output, "FPS: " + to_string(fps), Point(0, 30), FONT_HERSHEY_DUPLEX, fontScale, text_color, fontSize);
    }

    if (isLocated && score >= score_threshold)
    {
        rectangle(output, bbox, box_color, 2);
        putText(output, format("%.2f", score), Point(bbox.x, bbox.y + 25),
                FONT_HERSHEY_DUPLEX, fontScale, text_color, fontSize);
    }
    else
    {
        // Center the "lost" message on the frame.
        Size text_size = getTextSize("Target lost!", FONT_HERSHEY_DUPLEX, fontScale, fontSize, nullptr);
        int text_x = (w - text_size.width) / 2;
        int text_y = (h - text_size.height) / 2;
        putText(output, "Target lost!", Point(text_x, text_y), FONT_HERSHEY_DUPLEX, fontScale, Scalar(0, 0, 255), fontSize);
    }

    return output;
}

int main(int argc, char** argv)
{
CommandLineParser parser(argc, argv,
"{help h | | Print help message. }"
"{input i | |Set path to the input video. Omit for using default camera.}"
"{model_path |object_tracking_vittrack_2023sep.onnx |Set model path}"
"{backend_target bt |0 |Choose backend-target pair: 0 - OpenCV implementation + CPU, 1 - CUDA + GPU (CUDA), 2 - CUDA + GPU (CUDA FP16), 3 - TIM-VX + NPU, 4 - CANN + NPU}"
"{save s |false |Specify to save a file with results.}"
"{vis v |true |Specify to open a new window to show results.}");
if (parser.has("help"))
{
parser.printMessage();
return 0;
}

string input = parser.get<string>("input");
string model_path = parser.get<string>("model_path");
int backend_target = parser.get<int>("backend_target");
bool save = parser.get<bool>("save");
bool vis = parser.get<bool>("vis");

vector<vector<int>> backend_target_pairs =
{
{DNN_BACKEND_OPENCV, DNN_TARGET_CPU},
{DNN_BACKEND_CUDA, DNN_TARGET_CUDA},
{DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16},
{DNN_BACKEND_TIMVX, DNN_TARGET_NPU},
{DNN_BACKEND_CANN, DNN_TARGET_NPU}
};

int backend_id = backend_target_pairs[backend_target][0];
int target_id = backend_target_pairs[backend_target][1];

// Create VitTrack tracker
VitTrack tracker(model_path, backend_id, target_id);

// Open video capture
VideoCapture video;
if (input.empty())
{
video.open(0); // Default camera
}
else
{
video.open(input);
}

if (!video.isOpened())
{
cerr << "Error: Could not open video source" << endl;
return -1;
}

// Select an object
Mat first_frame;
video >> first_frame;

if (first_frame.empty())
{
cerr << "No frames grabbed!" << endl;
return -1;
}

Mat first_frame_copy = first_frame.clone();
putText(first_frame_copy, "1. Drag a bounding box to track.", Point(0, 25), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 255, 0));
putText(first_frame_copy, "2. Press ENTER to confirm", Point(0, 50), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 255, 0));
Rect roi = selectROI("VitTrack Demo", first_frame_copy);

if (roi.area() == 0)
{
cerr << "No ROI is selected! Exiting..." << endl;
return -1;
}
else
{
cout << "Selected ROI: " << roi << endl;
}

// Create VideoWriter if save option is specified
VideoWriter output_video;
if (save)
{
Size frame_size = first_frame.size();
output_video.open("output.mp4", VideoWriter::fourcc('m', 'p', '4', 'v'), video.get(CAP_PROP_FPS), frame_size);
if (!output_video.isOpened())
{
cerr << "Error: Could not create output video stream" << endl;
return -1;
}
}

// Initialize tracker with ROI
tracker.init(first_frame, roi);

// Track frame by frame
TickMeter tm;
while (waitKey(1) < 0)
{
video >> first_frame;
if (first_frame.empty())
{
cout << "End of video" << endl;
break;
}

// Inference
tm.start();
TrackingResult result = tracker.infer(first_frame);
tm.stop();

// Visualize
Mat frame = first_frame.clone();
frame = visualize(frame, result.bbox, result.score, result.isLocated, tm.getFPS());

if (save)
{
output_video.write(frame);
}

if (vis)
{
imshow("VitTrack Demo", frame);
}
tm.reset();
}

if (save)
{
output_video.release();
}

video.release();
destroyAllWindows();

return 0;
}
34 changes: 24 additions & 10 deletions models/object_tracking_vittrack/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,11 @@
{:d}: TIM-VX + NPU,
{:d}: CANN + NPU
'''.format(*[x for x in range(len(backend_target_pairs))]))
parser.add_argument('--save', '-s', action='store_true',
help='Usage: Specify to save a file with results. Invalid in case of camera input.')
parser.add_argument('--vis', '-v', action='store_true',
help='Usage: Specify to open a new window to show results. Invalid in case of camera input.')
parser.add_argument('--save', '-s', action='store_true', default=False,
help='Usage: Specify to save a file with results.')
parser.add_argument('--vis', '-v', action='store_true', default=True,
help='Usage: Specify to open a new window to show results.')
args = parser.parse_args()

def visualize(image, bbox, score, isLocated, fps=None, box_color=(0, 255, 0),text_color=(0, 255, 0), fontScale = 1, fontSize = 1):
output = image.copy()
h, w, _ = output.shape
Expand Down Expand Up @@ -80,16 +79,21 @@ def visualize(image, bbox, score, isLocated, fps=None, box_color=(0, 255, 0),tex
print('No frames grabbed!')
exit()
first_frame_copy = first_frame.copy()
cv.putText(first_frame_copy, "1. Drag a bounding box to track.", (0, 15), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0))
cv.putText(first_frame_copy, "2. Press ENTER to confirm", (0, 35), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0))
roi = cv.selectROI('vitTrack Demo', first_frame_copy)
cv.putText(first_frame_copy, "1. Drag a bounding box to track.", (0, 25), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0))
cv.putText(first_frame_copy, "2. Press ENTER to confirm", (0, 50), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0))
roi = cv.selectROI('VitTrack Demo', first_frame_copy)

if np.all(np.array(roi) == 0):
print("No roi is selected! Exiting ...")
print("No ROI is selected! Exiting ...")
exit()
else:
print("Selected ROI: {}".format(roi))

if args.save:
fps = video.get(cv.CAP_PROP_FPS)
frame_size = (first_frame.shape[1], first_frame.shape[0])
output_video = cv.VideoWriter('output.mp4', cv.VideoWriter_fourcc(*'mp4v'), fps, frame_size)

# Init tracker with ROI
model.init(first_frame, roi)

Expand All @@ -106,5 +110,15 @@ def visualize(image, bbox, score, isLocated, fps=None, box_color=(0, 255, 0),tex
tm.stop()
# Visualize
frame = visualize(frame, bbox, score, isLocated, fps=tm.getFPS())
cv.imshow('VitTrack Demo', frame)
if args.save:
output_video.write(frame)

if args.vis:
cv.imshow('VitTrack Demo', frame)
tm.reset()

if args.save:
output_video.release()

video.release()
cv.destroyAllWindows()