Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

C++ Demo - Object Tracking (VitTrack) #240

Merged
merged 8 commits into from
Mar 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions models/object_tracking_vittrack/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
cmake_minimum_required(VERSION 3.24)
set(project_name "opencv_zoo_object_tracking_vittrack")

PROJECT (${project_name})

set(OPENCV_VERSION "4.9.0")
set(OPENCV_INSTALLATION_PATH "" CACHE PATH "Where to look for OpenCV installation")
find_package(OpenCV ${OPENCV_VERSION} REQUIRED HINTS ${OPENCV_INSTALLATION_PATH})
# Find OpenCV, you may need to set OpenCV_DIR variable
# to the absolute path to the directory containing OpenCVConfig.cmake file
# via the command line or GUI

file(GLOB SourceFile
    "demo.cpp")
# If the package has been found, several variables will
# be set, you can find the full list with descriptions
# in the OpenCVConfig.cmake file.
# Print some message showing some of them
message(STATUS "OpenCV library status:")
message(STATUS "    config: ${OpenCV_DIR}")
message(STATUS "    version: ${OpenCV_VERSION}")
message(STATUS "    libraries: ${OpenCV_LIBS}")
message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")

# Set C++ compilation standard to C++11.
# NOTE: CMAKE_CXX_STANDARD only initializes the CXX_STANDARD property of
# targets created AFTER it is set, so it must come before add_executable().
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Declare the executable target built from your sources
add_executable(${project_name} ${SourceFile})

# Link your application with OpenCV libraries
target_link_libraries(${project_name} PRIVATE ${OpenCV_LIBS})
22 changes: 21 additions & 1 deletion models/object_tracking_vittrack/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,34 @@ This model is contributed by [Pengyu Liu](https://github.com/lpylpy0514) in GSoC
**NOTE: OpenCV > 4.8.0 is required. Build from source with instructions from https://opencv.org/get-started/.**

# Demo

## Python
```bash
# tracking on camera input
python demo.py

# tracking on video
python demo.py --input /path/to/video

# get help regarding various parameters
python demo.py --help
```
## C++
Install the latest OpenCV and CMake >= 3.24.0 to get started.

```shell
# A typical and default installation path of OpenCV is /usr/local
cmake -B build -D OPENCV_INSTALLATION_PATH=/path/to/opencv/installation .
cmake --build build

# tracking on camera input
./build/opencv_zoo_object_tracking_vittrack

# tracking on video
./build/opencv_zoo_object_tracking_vittrack -i=/path/to/video

# get help messages
./build/opencv_zoo_object_tracking_vittrack -h
```

# Example outputs

Expand Down
210 changes: 210 additions & 0 deletions models/object_tracking_vittrack/demo.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
#include <iostream>
#include <opencv2/opencv.hpp>

using namespace std;
using namespace cv;
using namespace dnn;

// Result of a single tracking step on one frame.
struct TrackingResult
{
    bool isLocated;  // true when the tracker reports the target was found in this frame
    Rect bbox;       // bounding box written by the tracker update (meaningful when isLocated)
    float score;     // tracking confidence score returned by the tracker for this frame
};

class VitTrack
{
public:

VitTrack(const string& model_path, int backend_id = 0, int target_id = 0)
{
params.net = model_path;
params.backend = backend_id;
params.target = target_id;
model = TrackerVit::create(params);
}

void init(const Mat& image, const Rect& roi)
{
model->init(image, roi);
}

TrackingResult infer(const Mat& image)
{
TrackingResult result;
result.isLocated = model->update(image, result.bbox);
result.score = model->getTrackingScore();
return result;
}

private:
TrackerVit::Params params;
Ptr<TrackerVit> model;
};

// Draws the tracking result onto a copy of the input frame.
//
// Shows the FPS counter (when fps >= 0), and either the tracked bounding box
// with its confidence score, or a centered "Target lost!" message when the
// target is not located or its score falls below score_threshold.
//
// Parameters:
//   image           - frame to annotate (not modified; a clone is returned)
//   bbox            - tracked bounding box from the tracker
//   score           - tracker confidence for this frame
//   isLocated       - whether the tracker reported the target as found
//   fps             - frames per second to display; negative hides the counter
//   box_color       - color of the bounding box
//   text_color      - color of FPS and score text
//   fontScale       - text scale passed to putText/getTextSize
//   fontSize        - text thickness passed to putText/getTextSize
//   score_threshold - minimum score to treat the target as located
//                     (previously hard-coded as 0.3)
// Returns: annotated copy of the input frame.
Mat visualize(const Mat& image, const Rect& bbox, float score, bool isLocated, double fps = -1.0,
              const Scalar& box_color = Scalar(0, 255, 0), const Scalar& text_color = Scalar(0, 255, 0),
              double fontScale = 1.0, int fontSize = 1, float score_threshold = 0.3f)
{
    Mat output = image.clone();
    int h = output.rows;
    int w = output.cols;

    if (fps >= 0)
    {
        putText(output, "FPS: " + to_string(fps), Point(0, 30), FONT_HERSHEY_DUPLEX, fontScale, text_color, fontSize);
    }

    if (isLocated && score >= score_threshold)
    {
        rectangle(output, bbox, box_color, 2);
        putText(output, format("%.2f", score), Point(bbox.x, bbox.y + 25),
                FONT_HERSHEY_DUPLEX, fontScale, text_color, fontSize);
    }
    else
    {
        // Center the "lost" message on the frame.
        Size text_size = getTextSize("Target lost!", FONT_HERSHEY_DUPLEX, fontScale, fontSize, nullptr);
        int text_x = (w - text_size.width) / 2;
        int text_y = (h - text_size.height) / 2;
        putText(output, "Target lost!", Point(text_x, text_y), FONT_HERSHEY_DUPLEX, fontScale, Scalar(0, 0, 255), fontSize);
    }

    return output;
}

int main(int argc, char** argv)
{
CommandLineParser parser(argc, argv,
"{help h | | Print help message. }"
"{input i | |Set path to the input video. Omit for using default camera.}"
"{model_path |object_tracking_vittrack_2023sep.onnx |Set model path}"
"{backend_target bt |0 |Choose backend-target pair: 0 - OpenCV implementation + CPU, 1 - CUDA + GPU (CUDA), 2 - CUDA + GPU (CUDA FP16), 3 - TIM-VX + NPU, 4 - CANN + NPU}"
"{save s |false |Specify to save a file with results.}"
"{vis v |true |Specify to open a new window to show results.}");
if (parser.has("help"))
{
parser.printMessage();
return 0;
}

string input = parser.get<string>("input");
string model_path = parser.get<string>("model_path");
int backend_target = parser.get<int>("backend_target");
bool save = parser.get<bool>("save");
bool vis = parser.get<bool>("vis");

vector<vector<int>> backend_target_pairs =
{
{DNN_BACKEND_OPENCV, DNN_TARGET_CPU},
{DNN_BACKEND_CUDA, DNN_TARGET_CUDA},
{DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16},
{DNN_BACKEND_TIMVX, DNN_TARGET_NPU},
{DNN_BACKEND_CANN, DNN_TARGET_NPU}
};

int backend_id = backend_target_pairs[backend_target][0];
int target_id = backend_target_pairs[backend_target][1];

// Create VitTrack tracker
VitTrack tracker(model_path, backend_id, target_id);

// Open video capture
VideoCapture video;
if (input.empty())
{
video.open(0); // Default camera
}
else
{
video.open(input);
}

if (!video.isOpened())
{
cerr << "Error: Could not open video source" << endl;
return -1;
}

// Select an object
Mat first_frame;
video >> first_frame;

if (first_frame.empty())
{
cerr << "No frames grabbed!" << endl;
return -1;
}

Mat first_frame_copy = first_frame.clone();
putText(first_frame_copy, "1. Drag a bounding box to track.", Point(0, 25), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 255, 0));
putText(first_frame_copy, "2. Press ENTER to confirm", Point(0, 50), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 255, 0));
Rect roi = selectROI("VitTrack Demo", first_frame_copy);

if (roi.area() == 0)
{
cerr << "No ROI is selected! Exiting..." << endl;
return -1;
}
else
{
cout << "Selected ROI: " << roi << endl;
}

// Create VideoWriter if save option is specified
VideoWriter output_video;
if (save)
{
Size frame_size = first_frame.size();
output_video.open("output.mp4", VideoWriter::fourcc('m', 'p', '4', 'v'), video.get(CAP_PROP_FPS), frame_size);
if (!output_video.isOpened())
{
cerr << "Error: Could not create output video stream" << endl;
return -1;
}
}

// Initialize tracker with ROI
tracker.init(first_frame, roi);

// Track frame by frame
TickMeter tm;
while (waitKey(1) < 0)
{
video >> first_frame;
if (first_frame.empty())
{
cout << "End of video" << endl;
break;
}

// Inference
tm.start();
TrackingResult result = tracker.infer(first_frame);
tm.stop();

// Visualize
Mat frame = first_frame.clone();
frame = visualize(frame, result.bbox, result.score, result.isLocated, tm.getFPS());

if (save)
{
output_video.write(frame);
}

if (vis)
{
imshow("VitTrack Demo", frame);
}
tm.reset();
}

if (save)
{
output_video.release();
}

video.release();
destroyAllWindows();

return 0;
}
34 changes: 24 additions & 10 deletions models/object_tracking_vittrack/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,11 @@
{:d}: TIM-VX + NPU,
{:d}: CANN + NPU
'''.format(*[x for x in range(len(backend_target_pairs))]))
parser.add_argument('--save', '-s', action='store_true',
help='Usage: Specify to save a file with results. Invalid in case of camera input.')
parser.add_argument('--vis', '-v', action='store_true',
help='Usage: Specify to open a new window to show results. Invalid in case of camera input.')
parser.add_argument('--save', '-s', action='store_true', default=False,
help='Usage: Specify to save a file with results.')
parser.add_argument('--vis', '-v', action='store_true', default=True,
help='Usage: Specify to open a new window to show results.')
args = parser.parse_args()

def visualize(image, bbox, score, isLocated, fps=None, box_color=(0, 255, 0),text_color=(0, 255, 0), fontScale = 1, fontSize = 1):
output = image.copy()
h, w, _ = output.shape
Expand Down Expand Up @@ -80,16 +79,21 @@ def visualize(image, bbox, score, isLocated, fps=None, box_color=(0, 255, 0),tex
print('No frames grabbed!')
exit()
first_frame_copy = first_frame.copy()
cv.putText(first_frame_copy, "1. Drag a bounding box to track.", (0, 15), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0))
cv.putText(first_frame_copy, "2. Press ENTER to confirm", (0, 35), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0))
roi = cv.selectROI('vitTrack Demo', first_frame_copy)
cv.putText(first_frame_copy, "1. Drag a bounding box to track.", (0, 25), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0))
cv.putText(first_frame_copy, "2. Press ENTER to confirm", (0, 50), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0))
roi = cv.selectROI('VitTrack Demo', first_frame_copy)

if np.all(np.array(roi) == 0):
print("No roi is selected! Exiting ...")
print("No ROI is selected! Exiting ...")
exit()
else:
print("Selected ROI: {}".format(roi))

if args.save:
fps = video.get(cv.CAP_PROP_FPS)
frame_size = (first_frame.shape[1], first_frame.shape[0])
output_video = cv.VideoWriter('output.mp4', cv.VideoWriter_fourcc(*'mp4v'), fps, frame_size)

# Init tracker with ROI
model.init(first_frame, roi)

Expand All @@ -106,5 +110,15 @@ def visualize(image, bbox, score, isLocated, fps=None, box_color=(0, 255, 0),tex
tm.stop()
# Visualize
frame = visualize(frame, bbox, score, isLocated, fps=tm.getFPS())
cv.imshow('VitTrack Demo', frame)
if args.save:
output_video.write(frame)

if args.vis:
cv.imshow('VitTrack Demo', frame)
tm.reset()

if args.save:
output_video.release()

video.release()
cv.destroyAllWindows()