diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index fa1b1fe..38a2970 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -19,12 +19,22 @@ jobs:
         with:
           python-version: 3.9
 
+      - name: Install ffmpeg
+        run: |
+          sudo apt-get update
+          sudo apt-get upgrade -y
+          sudo apt-get install -y ffmpeg
+
       - name: Run dependency libraries
         run: |
           pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1 torchtext==0.15.1
           pip install easydict pandas tqdm pyyaml scikit-learn ffmpeg-python ftfy regex einops fvcore gradio torchlibrosa librosa
           pip install 'clip@git+https://github.com/openai/CLIP.git'
+          pip install 'git+https://github.com/line/lighthouse.git'
           pip install pytest
+          pip uninstall -y numpy
+          pip install numpy==1.23.5
+
 
       - name: Run pytest
         run: pytest tests/test_models.py
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index fe9a6f7..733ed7d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,8 @@ gradio_demo/weights/*.ckpt
 highlight_*.png
 SLOWFAST_8x8_R50.pkl
 Cnn14_mAP=0.431.pth
+tests/test_videos/video_duration_*
+tests/weights/*.ckpt
 
 # Mac
 .DS_Store
diff --git a/README.md b/README.md
index 377efba..f7f931d 100755
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ It supports seven models, four features (video and audio features), and six data
 We will release v1.0 until the end of September. Our plan includes:
 - [x] : Reduce the configuration files (issue #19)
 - [ ] : Update the trained weights and feature files on Google Drive and Zenodo
-- [ ] : Introduce PyTest for inference API (issue #21)
+- [x] : Introduce PyTest for inference API (issue #21)
 - [x] : Introduce Linter for inference API (issue #20)
 
 ## Installation
diff --git a/lighthouse/feature_extractor/vision_encoder.py b/lighthouse/feature_extractor/vision_encoder.py
index 2215b58..1dde4fc 100644
--- a/lighthouse/feature_extractor/vision_encoder.py
+++ b/lighthouse/feature_extractor/vision_encoder.py
@@ -87,6 +87,13 @@ def _select_visual_encoders(self) -> List[Any]:
                             model_path_dict[self._feature_name])]
         return visual_encoders
 
+    def _trim_shorter_length(self, visual_features):
+        # Encoders can return different sequence lengths for the same video;
+        # trim every feature to the shortest so they can be concatenated.
+        min_length = min([x.shape[0] for x in visual_features])
+        trimmed_visual_features = [x[:min_length] for x in visual_features]
+        return trimmed_visual_features
+
     def encode(
         self,
         input_path: str) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -94,6 +101,6 @@ def encode(
         frame_inputs = [loader(input_path) for loader in self._frame_loaders]
         assert not any([item is None for item in frame_inputs]), 'one of the loaders return None object.'
         visual_features = [encoder(frames) for encoder, frames in zip(self._visual_encoders, frame_inputs)]
-        concat_features = torch.concat(visual_features, dim=-1)
+        concat_features = torch.concat(self._trim_shorter_length(visual_features), dim=-1)
         visual_mask = torch.ones(1, len(concat_features)).to(self._device)
         return concat_features, visual_mask
\ No newline at end of file
diff --git a/lighthouse/frame_loaders/slowfast_loader.py b/lighthouse/frame_loaders/slowfast_loader.py
index c01da6b..8ae55b3 100644
--- a/lighthouse/frame_loaders/slowfast_loader.py
+++ b/lighthouse/frame_loaders/slowfast_loader.py
@@ -102,11 +102,12 @@ def _pad_frames(self, tensor, value=0):
         if n == self._target_fps:
             return tensor
         if self._padding_mode == "constant":
-            z = torch.ones(n, tensor.shape[1], tensor.shape[2], tensor.shape[3], dtype=torch.uint8)
+            # n can be fractional after the fps arithmetic; sizes must be int.
+            z = torch.ones(int(n), tensor.shape[1], tensor.shape[2], tensor.shape[3], dtype=torch.uint8)
             z *= value
             return torch.cat((tensor, z), 0)
         elif self._padding_mode == "tile":
-            z = torch.cat(n * [tensor[-1:, :, :, :]])
+            z = torch.cat(int(n) * [tensor[-1:, :, :, :]])
             return torch.cat((tensor, z), 0)
         else:
             raise NotImplementedError(
diff --git a/lighthouse/models.py b/lighthouse/models.py
index 7a1def6..56cf637 100644
--- a/lighthouse/models.py
+++ b/lighthouse/models.py
@@ -198,10 +198,10 @@ def _post_processing(
         pred_spans = torch.clamp(span_cxw_to_xx(pred_spans) * video_duration, min=0, max=video_duration)
         cur_ranked_preds = torch.cat([pred_spans, scores[:, None]], dim=1).tolist()
         cur_ranked_preds = sorted(cur_ranked_preds, key=lambda x: x[2], reverse=True)
-        cur_ranked_preds = [[float(f"{e:.4f}") for e in row] for row in cur_ranked_preds][:self._moment_num]
+        cur_ranked_preds = [[float(f"{e:.4f}") for e in row] for row in cur_ranked_preds]
         saliency_scores = outputs["saliency_scores"][inputs["src_vid_mask"] == 1].cpu().tolist()
-
-        return cur_ranked_preds, saliency_scores
+
+        return cur_ranked_preds[:self._moment_num], saliency_scores
 
     def _encode_audio(
         self,
diff --git a/tests/test_models.py b/tests/test_models.py
index 190a13c..796a05f 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -1,3 +1,90 @@
+import os
+import math
+import pytest
+import subprocess
+from lighthouse.models import (MomentDETRPredictor, QDDETRPredictor, EaTRPredictor,
+                               CGDETRPredictor, UVCOMPredictor)
-def test():
-    return True
\ No newline at end of file
+
+FEATURES = ['clip', 'clip_slowfast']
+MODELS = ['moment_detr', 'qd_detr', 'eatr', 'cg_detr', 'uvcom']
+DATASETS = ['qvhighlight']
+MIN_DURATION = 10
+MAX_DURATION = 151
+MOMENT_NUM = 10
+
+
+@pytest.mark.dependency()
+def test_generate_multiple_duration_videos():
+    durations = [i for i in range(MIN_DURATION, MAX_DURATION)]
+    return_codes = []
+    for duration in durations:
+        cmd = f'ffmpeg -y -i api_example/RoripwjYFp8_60.0_210.0.mp4 -t {duration} -c copy tests/test_videos/video_duration_{duration}.mp4'
+        result = subprocess.run(cmd, shell=True)
+        return_codes.append(result.returncode)
+    for return_code in return_codes:
+        assert return_code == 0, '[ffmpeg conversion] return code should be 0.'
+
+@pytest.mark.dependency()
+def test_save_model_weights():
+    return_codes = []
+    for feature in FEATURES:
+        for model in MODELS:
+            for dataset in DATASETS:
+                if not os.path.exists(f'tests/weights/{feature}_{model}_{dataset}.ckpt'):
+                    cmd = f'wget -P tests/weights/ https://zenodo.org/records/13363606/files/{feature}_{model}_{dataset}.ckpt'
+                    result = subprocess.run(cmd, shell=True)
+                    return_codes.append(result.returncode)
+    for return_code in return_codes:
+        assert return_code == 0, '[save model weights] return code should be 0.'
+
+@pytest.mark.dependency()
+def test_load_slowfast_pann_weights():
+    if not os.path.exists('tests/SLOWFAST_8x8_R50.pkl'):
+        result = subprocess.run('wget -P tests/ https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/kinetics400/SLOWFAST_8x8_R50.pkl', shell=True)
+        assert result.returncode == 0, '[save SlowFast weights] return code should be 0.'
+
+    if not os.path.exists('tests/Cnn14_mAP=0.431.pth'):
+        result = subprocess.run('wget -P tests/ https://zenodo.org/record/3987831/files/Cnn14_mAP%3D0.431.pth', shell=True)
+        assert result.returncode == 0, '[save PANNs weights] return code should be 0.'
+
+@pytest.mark.dependency(depends=['test_generate_multiple_duration_videos',
+                                 'test_save_model_weights',
+                                 'test_load_slowfast_pann_weights'])
+def test_model_prediction():
+    """
+    Test all of the trained models, except for the resnet_glove features and Taskweave.
+    Untested features:
+    - ResNet+GloVe is skipped due to its poor performance.
+    - CLIP+Slowfast+PANNs is skipped due to its high latency.
+
+    Untested models:
+    - TR-DETR is skipped because the model calls .cuda() internally. We need to remove it.
+    - Taskweave is skipped because two strategies are necessary for prediction.
+    """
+    model_loaders = {
+        'moment_detr': MomentDETRPredictor,
+        'qd_detr': QDDETRPredictor,
+        'eatr': EaTRPredictor,
+        'cg_detr': CGDETRPredictor,
+        'uvcom': UVCOMPredictor,
+    }
+
+    for feature in FEATURES:
+        for model_name in MODELS:
+            for dataset in DATASETS:
+                model_weight = os.path.join('tests/weights/', f'{feature}_{model_name}_{dataset}.ckpt')
+                model = model_loaders[model_name](model_weight, device='cpu', feature_name=feature,
+                                                  slowfast_path='tests/SLOWFAST_8x8_R50.pkl',
+                                                  pann_path='tests/Cnn14_mAP=0.431.pth')
+
+                # test the model on videos from 10s to 150s
+                for second in range(MIN_DURATION, MAX_DURATION):
+                    video_path = f'tests/test_videos/video_duration_{second}.mp4'
+                    model.encode_video(video_path)
+                    query = 'A woman wearing a glass is speaking in front of the camera'
+                    prediction = model.predict(query)
+                    assert len(prediction['pred_relevant_windows']) == MOMENT_NUM, \
+                        f'The number of moments from {feature}_{model_name}_{dataset} is expected to be {MOMENT_NUM}, but got {len(prediction["pred_relevant_windows"])}.'
+                    assert len(prediction['pred_saliency_scores']) == math.ceil(second / model._clip_len), \
+                        f'The number of saliency scores from {feature}_{model_name}_{dataset} is expected to be {math.ceil(second / model._clip_len)}, but got {len(prediction["pred_saliency_scores"])}.'
\ No newline at end of file
diff --git a/tests/test_videos/.gitkeep b/tests/test_videos/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/tests/weights/.gitkeep b/tests/weights/.gitkeep
new file mode 100644
index 0000000..e69de29
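
Note on the vision_encoder.py change: different visual encoders can emit slightly different numbers of temporal steps for the same clip, and torch.concat(..., dim=-1) requires every non-concatenated dimension to match, so the new _trim_shorter_length helper aligns the sequences before fusing them. A minimal standalone sketch of the idea (the shapes below are illustrative assumptions, not the library's actual feature dimensions):

    import torch

    def trim_shorter_length(visual_features):
        # Cut every feature sequence down to the shortest one so the tensors
        # agree on the time axis and can be concatenated feature-wise.
        min_length = min(x.shape[0] for x in visual_features)
        return [x[:min_length] for x in visual_features]

    clip_feat = torch.randn(75, 512)       # hypothetical: 75 CLIP steps, dim 512
    slowfast_feat = torch.randn(74, 2304)  # hypothetical: 74 Slowfast steps, dim 2304

    # Concatenating directly would raise a size mismatch on dim 0;
    # trimming both to 74 steps first makes the concat valid.
    fused = torch.concat(trim_shorter_length([clip_feat, slowfast_feat]), dim=-1)
    print(fused.shape)  # torch.Size([74, 2816])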