Merge pull request #35 from line/model_prediction_test

Add test for inference APIs
line · Sep 18, 2024 · 9062b7a · 9062b7a
2 parents caa7125 + b073107
commit 9062b7a
Show file tree

Hide file tree

Showing 9 changed files with 113 additions and 9 deletions.
diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
@@ -19,12 +19,22 @@ jobs:
         with:
           python-version: 3.9
 
+      - name: Run ffmpeg
+        run: |
+          # Every apt-get call that can prompt gets -y so the step never waits
+          # for (or aborts on) a confirmation in a non-interactive CI shell.
+          sudo apt-get update
+          sudo apt-get upgrade -y
+          sudo apt-get install -y ffmpeg
+
       - name: Run dependency libraries
         run: |
           pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1 torchtext==0.15.1
           pip install easydict pandas tqdm pyyaml scikit-learn ffmpeg-python ftfy regex einops fvcore gradio torchlibrosa librosa
           pip install 'clip@git+https://github.com/openai/CLIP.git'
+          pip install 'git+https://github.com/line/lighthouse.git'
           pip install pytest
+          pip uninstall -y numpy
+          pip install numpy==1.23.5
+
 
       - name: Run pytest
         run: pytest tests/test_models.py
diff --git a/.gitignore b/.gitignore
@@ -8,6 +8,8 @@ gradio_demo/weights/*.ckpt
 highlight_*.png
 SLOWFAST_8x8_R50.pkl
 Cnn14_mAP=0.431.pth
+tests/test_videos/video_duration_*
+tests/weights/*.ckpt
 
 # Mac
 .DS_Store

diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ It supports seven models, four features (video and audio features), and six data
 We will release v1.0 until the end of September. Our plan includes:
 - [x] : Reduce the configuration files (issue #19)
 - [ ] : Update the trained weights and feature files on Google Drive and Zenodo
-- [ ] : Introduce PyTest for inference API (issue #21)
+- [x] : Introduce PyTest for inference API (issue #21)
 - [x] : Introduce Linter for inference API (issue #20)
 
 ## Installation

diff --git a/lighthouse/feature_extractor/vision_encoder.py b/lighthouse/feature_extractor/vision_encoder.py
@@ -87,13 +87,18 @@ def _select_visual_encoders(self) -> List[Any]:
                                                           model_path_dict[self._feature_name])]
         return visual_encoders
 
+    def _trim_shorter_length(self, visual_features):
+        """Cut every feature down to the shortest first-dimension length.
+
+        Args:
+            visual_features: sequence of features; only ``shape[0]`` and
+                slicing along the first axis are used here.
+
+        Returns:
+            A new list whose items all share the same first-dimension length.
+        """
+        shortest = min(feature.shape[0] for feature in visual_features)
+        return [feature[:shortest] for feature in visual_features]
+
     def encode(
         self,
         input_path: str) -> Tuple[torch.Tensor, torch.Tensor]:
         assert len(self._frame_loaders) == len(self._visual_encoders), 'the number of frame_loaders and visual_encoders is different.'
         frame_inputs = [loader(input_path) for loader in self._frame_loaders]
         assert not any([item is None for item in frame_inputs]), 'one of the loaders return None object.'
         visual_features = [encoder(frames) for encoder, frames in zip(self._visual_encoders, frame_inputs)]
-        concat_features = torch.concat(visual_features, dim=-1)
+        concat_features = torch.concat(self._trim_shorter_length(visual_features), dim=-1)
         visual_mask = torch.ones(1, len(concat_features)).to(self._device)
         return concat_features, visual_mask
diff --git a/lighthouse/frame_loaders/slowfast_loader.py b/lighthouse/frame_loaders/slowfast_loader.py
@@ -102,11 +102,11 @@ def _pad_frames(self, tensor, value=0):
         if n == self._target_fps:
             return tensor
         if self._padding_mode == "constant":
-            z = torch.ones(n, tensor.shape[1], tensor.shape[2], tensor.shape[3], dtype=torch.uint8)
+            z = torch.ones(int(n), tensor.shape[1], tensor.shape[2], tensor.shape[3], dtype=torch.uint8)
             z *= value
             return torch.cat((tensor, z), 0)
         elif self._padding_mode == "tile":
-            z = torch.cat(n * [tensor[-1:, :, :, :]])
+            z = torch.cat(int(n) * [tensor[-1:, :, :, :]])
             return torch.cat((tensor, z), 0)
         else:
             raise NotImplementedError(

diff --git a/lighthouse/models.py b/lighthouse/models.py
@@ -198,10 +198,10 @@ def _post_processing(
         pred_spans = torch.clamp(span_cxw_to_xx(pred_spans) * video_duration, min=0, max=video_duration)
         cur_ranked_preds = torch.cat([pred_spans, scores[:, None]], dim=1).tolist()
         cur_ranked_preds = sorted(cur_ranked_preds, key=lambda x: x[2], reverse=True)
-        cur_ranked_preds = [[float(f"{e:.4f}") for e in row] for row in cur_ranked_preds][:self._moment_num]
+        cur_ranked_preds = [[float(f"{e:.4f}") for e in row] for row in cur_ranked_preds]
         saliency_scores = outputs["saliency_scores"][inputs["src_vid_mask"] == 1].cpu().tolist()
-
-        return cur_ranked_preds, saliency_scores
+        
+        return cur_ranked_preds[:self._moment_num], saliency_scores
 
     def _encode_audio(
         self,

diff --git a/tests/test_models.py b/tests/test_models.py
@@ -1,3 +1,90 @@
+import os
+import math
+import pytest
+import subprocess
+from lighthouse.models import (MomentDETRPredictor, QDDETRPredictor, EaTRPredictor, 
+                               CGDETRPredictor, UVCOMPredictor)
 
-def test():
-    return True
+
+# Feature sets under test; each name is part of the checkpoint file name and
+# is passed to the predictor as ``feature_name``.
+FEATURES = ['clip', 'clip_slowfast']
+# Model identifiers; each is part of the checkpoint file name and is used as a
+# key into ``model_loaders`` in test_model_prediction.
+MODELS = ['moment_detr', 'qd_detr', 'eatr', 'cg_detr', 'uvcom']
+# Dataset names used in the checkpoint file names.
+DATASETS = ['qvhighlight']
+# Clip durations in seconds are range(MIN_DURATION, MAX_DURATION), i.e. 10..150
+# (MAX_DURATION itself is excluded).
+MIN_DURATION = 10
+MAX_DURATION = 151
+# Number of relevant windows each prediction is expected to contain.
+MOMENT_NUM = 10
+
+
+@pytest.mark.dependency()
+def test_generate_multiple_duration_videos():
+    """Cut the sample video into one clip per tested duration with ffmpeg.
+
+    For every duration in ``range(MIN_DURATION, MAX_DURATION)`` the first
+    ``duration`` seconds of the sample video are stream-copied to
+    ``tests/test_videos/video_duration_{duration}.mp4``. All clips are
+    attempted before asserting, so the failure message lists every duration
+    whose ffmpeg call returned a non-zero exit status.
+
+    Raises:
+        FileNotFoundError: if the ``ffmpeg`` executable is not on PATH.
+    """
+    source_video = 'api_example/RoripwjYFp8_60.0_210.0.mp4'
+    failed_durations = []
+    for duration in range(MIN_DURATION, MAX_DURATION):
+        # Argument list instead of a shell string: no shell is spawned and no
+        # quoting is needed.
+        cmd = ['ffmpeg', '-y', '-i', source_video, '-t', str(duration), '-c', 'copy',
+               f'tests/test_videos/video_duration_{duration}.mp4']
+        result = subprocess.run(cmd)
+        if result.returncode != 0:
+            failed_durations.append(duration)
+    assert not failed_durations, \
+        f'[ffmpeg conversion] return_code should be set 0. Failed durations: {failed_durations}'
+
+@pytest.mark.dependency()
+def test_save_model_weights():
+    """Fetch every missing ``{feature}_{model}_{dataset}.ckpt`` into tests/weights/.
+
+    Checkpoints already on disk are left alone. Each missing one is downloaded
+    with wget, and every wget exit status must be 0.
+    """
+    exit_statuses = []
+    for feature in FEATURES:
+        for model in MODELS:
+            for dataset in DATASETS:
+                if os.path.exists(f'tests/weights/{feature}_{model}_{dataset}.ckpt'):
+                    continue  # already downloaded
+                download = subprocess.run(
+                    f'wget -P tests/weights/ https://zenodo.org/records/13363606/files/{feature}_{model}_{dataset}.ckpt',
+                    shell=True)
+                exit_statuses.append(download.returncode)
+    for status in exit_statuses:
+        assert status == 0, '[save model weights] return_code should be set 0.'
+
+@pytest.mark.dependency()
+def test_load_slowfast_pann_weights():
+    """Download the SlowFast and PANNs weight files into tests/ when absent.
+
+    Each file is checked in turn; a missing file is fetched with wget and the
+    wget exit status must be 0 before the next file is considered.
+    """
+    downloads = (
+        ('tests/SLOWFAST_8x8_R50.pkl',
+         'wget -P tests/ https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/kinetics400/SLOWFAST_8x8_R50.pkl',
+         '[Save slowfast weights] return_code should be set 0.'),
+        ('tests/Cnn14_mAP=0.431.pth',
+         'wget -P tests/ https://zenodo.org/record/3987831/files/Cnn14_mAP%3D0.431.pth',
+         '[Save PANNs weights] return_code should be set 0.'),
+    )
+    for local_path, command, failure_message in downloads:
+        if os.path.exists(local_path):
+            continue
+        completed = subprocess.run(command, shell=True)
+        assert completed.returncode == 0, failure_message
+
+@pytest.mark.dependency(depends=['test_generate_multiple_duration_videos',
+                                 'test_save_model_weights',
+                                 'test_load_slowfast_pann_weights'])
+def test_model_prediction():
+    """
+    Run every tested feature/model/dataset checkpoint on CPU over all generated clips.
+
+    For each clip the prediction must contain MOMENT_NUM relevant windows and
+    ceil(duration / model._clip_len) saliency scores.
+
+    Untested features:
+        - ResNet+GloVe is skipped due to its low performance.
+        - CLIP+Slowfast+PANNs is skipped for latency reasons (presumably it is
+          too slow for the test run; confirm).
+
+    Untested models:
+        - TR-DETR is skipped because the model uses the .cuda() function. We need to remove it.
+        - Taskweave is skipped because two strategies are necessary for prediction.
+    """
+    predictor_classes = {
+        'moment_detr': MomentDETRPredictor,
+        'qd_detr': QDDETRPredictor,
+        'eatr': EaTRPredictor,
+        'cg_detr': CGDETRPredictor,
+        'uvcom': UVCOMPredictor,
+    }
+    # The same text query is used for every model and clip.
+    query = 'A woman wearing a glass is speaking in front of the camera'
+
+    for feature in FEATURES:
+        for model_name in MODELS:
+            for dataset in DATASETS:
+                model_weight = os.path.join('tests/weights/', f'{feature}_{model_name}_{dataset}.ckpt')
+                model = predictor_classes[model_name](
+                    model_weight,
+                    device='cpu',
+                    feature_name=feature,
+                    slowfast_path='tests/SLOWFAST_8x8_R50.pkl',
+                    pann_path='tests/Cnn14_mAP=0.431.pth',
+                )
+
+                # One clip per duration, 10s to 150s.
+                for second in range(MIN_DURATION, MAX_DURATION):
+                    model.encode_video(f'tests/test_videos/video_duration_{second}.mp4')
+                    prediction = model.predict(query)
+                    assert len(prediction['pred_relevant_windows']) == MOMENT_NUM, \
+                        f'The number of moments from {feature}_{model_name}_{dataset} is expected {MOMENT_NUM}, but got {len(prediction["pred_relevant_windows"])}.'
+                    assert len(prediction['pred_saliency_scores']) == math.ceil(second / model._clip_len), \
+                        f'The number of saliency scores from {feature}_{model_name}_{dataset} is expected {math.ceil(second / model._clip_len)}, but got {len(prediction["pred_saliency_scores"])}.'
diff --git a/tests/test_videos/.gitkeep b/tests/test_videos/.gitkeep
diff --git a/tests/weights/.gitkeep b/tests/weights/.gitkeep