From 5bb290002d9fdb5d9e9c779ffc27baef8bf2cd67 Mon Sep 17 00:00:00 2001 From: vidit Date: Sun, 20 Mar 2022 05:57:00 +0530 Subject: [PATCH 1/2] Fixed mmaction folder to make Swin-T working and made the installation process much faster and easier --- docs/install.md | 37 +++-- mmaction/apis/inference.py | 18 +-- mmaction/apis/train.py | 62 ++------ mmaction/core/__init__.py | 2 +- mmaction/core/bbox/transforms.py | 32 ++--- .../evaluation/ava_evaluation/np_box_list.py | 7 +- .../object_detection_evaluation.py | 92 +++++++----- .../ava_evaluation/per_image_evaluation.py | 136 +++++++++++------- mmaction/core/evaluation/ava_utils.py | 103 ++++++------- mmaction/core/evaluation/eval_hooks.py | 5 +- mmaction/datasets/__init__.py | 4 +- mmaction/datasets/audio_visual_dataset.py | 2 +- mmaction/datasets/base.py | 2 +- mmaction/datasets/blending_utils.py | 28 ++-- mmaction/datasets/pipelines/__init__.py | 4 +- mmaction/datasets/pipelines/augmentations.py | 62 ++------ mmaction/datasets/pipelines/loading.py | 76 ++++------ mmaction/datasets/pipelines/pose_loading.py | 8 +- mmaction/datasets/ssn_dataset.py | 2 +- mmaction/models/__init__.py | 6 +- mmaction/models/backbones/__init__.py | 4 +- mmaction/models/backbones/mobilenet_v2.py | 8 +- mmaction/models/backbones/resnet3d.py | 22 ++- mmaction/models/backbones/resnet3d_csn.py | 2 +- .../models/backbones/resnet3d_slowfast.py | 2 +- mmaction/models/backbones/resnet_audio.py | 8 +- mmaction/models/backbones/tanet.py | 36 ++--- mmaction/models/common/lfb.py | 2 +- mmaction/models/heads/bbox_head.py | 11 +- mmaction/models/heads/lfb_infer_head.py | 2 +- mmaction/models/heads/misc_head.py | 2 +- mmaction/models/recognizers/__init__.py | 4 +- mmaction/models/recognizers/base.py | 10 +- mmaction/models/recognizers/recognizer2d.py | 4 +- mmaction/utils/__init__.py | 4 +- mmaction/utils/decorators.py | 1 + mmaction/utils/precise_bn.py | 5 +- 37 files changed, 369 insertions(+), 446 deletions(-) diff --git a/docs/install.md 
b/docs/install.md index 9ba611d3bb..9d77303d2d 100644 --- a/docs/install.md +++ b/docs/install.md @@ -48,8 +48,25 @@ conda install -y jpeg libtiff If mmcv and mmcv-full are both installed, there will be `ModuleNotFoundError`. ## Prepare environment - -a. Create a conda virtual environment and activate it. +- __*Method 1*__: + - This will create the necessary conda environment + ```shell + conda env create -f environment.yml + conda activate swint + ``` + based upon the path of python you can check this using + ```shell + which python + ``` + for example, it's ```/home/vidit/miniconda3/bin/python``` + then run + ```shell + cp -r mmaction/ /home/vidit/miniconda3/lib/python3.7/site-packages/mmaction/ + ``` + #### i.e. just copy the path before ```bin``` in the path of python and add ```/lib/python3.7/site-packages/mmaction/``` to it. +- __*Method 2*__: + + a. Create a conda virtual environment and activate it. ```shell conda create -n open-mmlab python=3.7 -y @@ -158,17 +175,17 @@ Note: 1. The git commit id will be written to the version number with step b, e.g. 0.6.0+2e7045c. The version will also be saved in trained models. It is recommended that you run step b each time you pull some updates from github. If C++/CUDA codes are modified, then this step is compulsory. -2. Following the above instructions, MMAction2 is installed on `dev` mode, any local modifications made to the code will take effect without the need to reinstall it (unless you submit some commits and want to update the version number). + 1. Following the above instructions, MMAction2 is installed on `dev` mode, any local modifications made to the code will take effect without the need to reinstall it (unless you submit some commits and want to update the version number). -3. If you would like to use `opencv-python-headless` instead of `opencv-python`, - you can install it before installing MMCV. + 2. 
If you would like to use `opencv-python-headless` instead of `opencv-python`, + you can install it before installing MMCV. -4. If you would like to use `PyAV`, you can install it with `conda install av -c conda-forge -y`. + 3. If you would like to use `PyAV`, you can install it with `conda install av -c conda-forge -y`. -5. Some dependencies are optional. Running `python setup.py develop` will only install the minimum runtime requirements. - To use optional dependencies like `decord`, either install them with `pip install -r requirements/optional.txt` - or specify desired extras when calling `pip` (e.g. `pip install -v -e .[optional]`, - valid keys for the `[optional]` field are `all`, `tests`, `build`, and `optional`) like `pip install -v -e .[tests,build]`. + 4. Some dependencies are optional. Running `python setup.py develop` will only install the minimum runtime requirements. + To use optional dependencies like `decord`, either install them with `pip install -r requirements/optional.txt` + or specify desired extras when calling `pip` (e.g. `pip install -v -e .[optional]`, + valid keys for the `[optional]` field are `all`, `tests`, `build`, and `optional`) like `pip install -v -e .[tests,build]`. 
## Install with CPU only diff --git a/mmaction/apis/inference.py b/mmaction/apis/inference.py index e31685b7f5..008e4e3d31 100644 --- a/mmaction/apis/inference.py +++ b/mmaction/apis/inference.py @@ -1,6 +1,5 @@ import os import os.path as osp -import re from operator import itemgetter import mmcv @@ -107,23 +106,10 @@ def inference_recognizer(model, filename_tmpl = cfg.data.test.get('filename_tmpl', 'img_{:05}.jpg') modality = cfg.data.test.get('modality', 'RGB') start_index = cfg.data.test.get('start_index', 1) - - # count the number of frames that match the format of `filename_tmpl` - # RGB pattern example: img_{:05}.jpg -> ^img_\d+.jpg$ - # Flow patteren example: {}_{:05d}.jpg -> ^x_\d+.jpg$ - pattern = f'^{filename_tmpl}$' - if modality == 'Flow': - pattern = pattern.replace('{}', 'x') - pattern = pattern.replace( - pattern[pattern.find('{'):pattern.find('}') + 1], '\\d+') - total_frames = len( - list( - filter(lambda x: re.match(pattern, x) is not None, - os.listdir(video_path)))) - data = dict( frame_dir=video_path, - total_frames=total_frames, + total_frames=len(os.listdir(video_path)), + # assuming files in ``video_path`` are all named with ``filename_tmpl`` # noqa: E501 label=-1, start_index=start_index, filename_tmpl=filename_tmpl, diff --git a/mmaction/apis/train.py b/mmaction/apis/train.py index 89e89ced98..ef3c1eec54 100644 --- a/mmaction/apis/train.py +++ b/mmaction/apis/train.py @@ -1,20 +1,15 @@ import copy as cp import os.path as osp - import torch from mmcv.parallel import MMDataParallel, MMDistributedDataParallel from mmcv.runner import (DistSamplerSeedHook, EpochBasedRunner, OptimizerHook, build_optimizer, get_dist_info) from mmcv.runner.hooks import Fp16OptimizerHook - from ..core import (DistEvalHook, EvalHook, OmniSourceDistSamplerSeedHook, OmniSourceRunner) from ..datasets import build_dataloader, build_dataset from ..utils import PreciseBNHook, get_root_logger from .test import multi_gpu_test -from mmcv_custom.runner import 
EpochBasedRunnerAmp -import apex -import os.path as osp def train_model(model, @@ -47,10 +42,8 @@ def train_model(model, # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] - if 'optimizer_config' not in cfg: - cfg.optimizer_config={} dataloader_setting = dict( - videos_per_gpu=cfg.data.get('videos_per_gpu', 1) // cfg.optimizer_config.get('update_interval', 1), + videos_per_gpu=cfg.data.get('videos_per_gpu', 1), workers_per_gpu=cfg.data.get('workers_per_gpu', 1), num_gpus=len(cfg.gpu_ids), dist=distributed, @@ -80,24 +73,6 @@ def train_model(model, build_dataloader(ds, **dataloader_setting) for ds in dataset ] - # build runner - optimizer = build_optimizer(model, cfg.optimizer) - # use apex fp16 optimizer - # Noticed that this is just a temporary patch. We shoud not encourage this kind of code style - use_amp = False - if ( - cfg.optimizer_config.get("type", None) - and cfg.optimizer_config["type"] == "DistOptimizerHook" - ): - if cfg.optimizer_config.get("use_fp16", False): - model, optimizer = apex.amp.initialize( - model.cuda(), optimizer, opt_level="O1" - ) - for m in model.modules(): - if hasattr(m, "fp16_enabled"): - m.fp16_enabled = True - use_amp = True - # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) @@ -112,23 +87,16 @@ def train_model(model, model = MMDataParallel( model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) - if use_amp: - Runner = EpochBasedRunnerAmp - runner = Runner( - model, - optimizer=optimizer, - work_dir=cfg.work_dir, - logger=logger, - meta=meta, - amp=use_amp) - else: - Runner = OmniSourceRunner if cfg.omnisource else EpochBasedRunner - runner = Runner( - model, - optimizer=optimizer, - work_dir=cfg.work_dir, - logger=logger, - meta=meta) + # build runner + optimizer = build_optimizer(model, cfg.optimizer) + + Runner = OmniSourceRunner if cfg.omnisource else EpochBasedRunner + runner = Runner( + model, + optimizer=optimizer, + 
work_dir=cfg.work_dir, + logger=logger, + meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp @@ -184,9 +152,7 @@ def train_model(model, runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) if cfg.resume_from: - runner.resume(cfg.resume_from, resume_amp=use_amp) - elif cfg.get("auto_resume", False) and osp.exists(osp.join(runner.work_dir, 'latest.pth')): - runner.auto_resume() + runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner_kwargs = dict() @@ -257,5 +223,5 @@ def train_model(model, eval_res = test_dataset.evaluate(outputs, **eval_cfg) runner.logger.info(f'Testing results of the {name} checkpoint') - for metric_name, val in eval_res.items(): - runner.logger.info(f'{metric_name}: {val:.04f}') + for name, val in eval_res.items(): + runner.logger.info(f'{name}: {val:.04f}') diff --git a/mmaction/core/__init__.py b/mmaction/core/__init__.py index f5f617cdf2..6842299583 100644 --- a/mmaction/core/__init__.py +++ b/mmaction/core/__init__.py @@ -1,6 +1,6 @@ from .bbox import * # noqa: F401, F403 from .evaluation import * # noqa: F401, F403 from .hooks import * # noqa: F401, F403 +from .lr import * # noqa: F401, F403 from .optimizer import * # noqa: F401, F403 from .runner import * # noqa: F401, F403 -from .scheduler import * # noqa: F401, F403 diff --git a/mmaction/core/bbox/transforms.py b/mmaction/core/bbox/transforms.py index b051e2275e..d61e8116d7 100644 --- a/mmaction/core/bbox/transforms.py +++ b/mmaction/core/bbox/transforms.py @@ -15,22 +15,22 @@ def bbox2result(bboxes, labels, num_classes, thr=0.01): """ if bboxes.shape[0] == 0: return list(np.zeros((num_classes - 1, 0, 5), dtype=np.float32)) + else: + bboxes = bboxes.cpu().numpy() + labels = labels.cpu().numpy() - bboxes = bboxes.cpu().numpy() - labels = labels.cpu().numpy() + # We only handle multilabel now + assert labels.shape[-1] > 1 - # We only handle multilabel now - assert labels.shape[-1] > 1 + 
scores = labels # rename for clarification + thr = (thr, ) * num_classes if isinstance(thr, float) else thr + assert scores.shape[1] == num_classes + assert len(thr) == num_classes - scores = labels # rename for clarification - thr = (thr, ) * num_classes if isinstance(thr, float) else thr - assert scores.shape[1] == num_classes - assert len(thr) == num_classes - - result = [] - for i in range(num_classes - 1): - where = scores[:, i + 1] > thr[i + 1] - result.append( - np.concatenate((bboxes[where, :4], scores[where, i + 1:i + 2]), - axis=1)) - return result + result = [] + for i in range(num_classes - 1): + where = scores[:, i + 1] > thr[i + 1] + result.append( + np.concatenate((bboxes[where, :4], scores[where, i + 1:i + 2]), + axis=1)) + return result diff --git a/mmaction/core/evaluation/ava_evaluation/np_box_list.py b/mmaction/core/evaluation/ava_evaluation/np_box_list.py index ddfdd5184d..f9b101e6f5 100644 --- a/mmaction/core/evaluation/ava_evaluation/np_box_list.py +++ b/mmaction/core/evaluation/ava_evaluation/np_box_list.py @@ -120,9 +120,8 @@ def get_coordinates(self): x_max = box_coordinates[:, 3] return [y_min, x_min, y_max, x_max] - @staticmethod - def _is_valid_boxes(data): - """Check whether data fulfills the format of N*[ymin, xmin, ymax, + def _is_valid_boxes(self, data): + """Check whether data fullfills the format of N*[ymin, xmin, ymax, xmin]. Args: @@ -132,7 +131,7 @@ def _is_valid_boxes(data): a boolean indicating whether all ymax of boxes are equal or greater than ymin, and all xmax of boxes are equal or greater than xmin. 
""" - if len(data) != 0: + if len(data): for v in data: if v[0] > v[2] or v[1] > v[3]: return False diff --git a/mmaction/core/evaluation/ava_evaluation/object_detection_evaluation.py b/mmaction/core/evaluation/ava_evaluation/object_detection_evaluation.py index 508a076def..95f0cc501c 100644 --- a/mmaction/core/evaluation/ava_evaluation/object_detection_evaluation.py +++ b/mmaction/core/evaluation/ava_evaluation/object_detection_evaluation.py @@ -29,7 +29,6 @@ import collections import logging -import warnings from abc import ABCMeta, abstractmethod from collections import defaultdict @@ -102,13 +101,15 @@ def clear(self): class ObjectDetectionEvaluator(DetectionEvaluator): """A class to evaluate detections.""" - def __init__(self, - categories, - matching_iou_threshold=0.5, - evaluate_corlocs=False, - metric_prefix=None, - use_weighted_mean_ap=False, - evaluate_masks=False): + def __init__( + self, + categories, + matching_iou_threshold=0.5, + evaluate_corlocs=False, + metric_prefix=None, + use_weighted_mean_ap=False, + evaluate_masks=False, + ): """Constructor. Args: @@ -243,8 +244,7 @@ def add_single_detected_image_info(self, image_id, detections_dict): detected_masks=detection_masks, ) - @staticmethod - def create_category_index(categories): + def create_category_index(self, categories): """Creates dictionary of COCO compatible categories keyed by category id. @@ -277,8 +277,14 @@ def evaluate(self): 2. 
per_category_ap: category specific results with keys of the form 'PerformanceByCategory/mAP@IOU/category' """ - (per_class_ap, mean_ap, _, _, per_class_corloc, - mean_corloc) = self._evaluation.evaluate() + ( + per_class_ap, + mean_ap, + _, + _, + per_class_corloc, + mean_corloc, + ) = self._evaluation.evaluate() metric = f'mAP@{self._matching_iou_threshold}IOU' pascal_metrics = {self._metric_prefix + metric: mean_ap} @@ -349,13 +355,15 @@ def __init__(self, categories, matching_iou_threshold=0.5): class ObjectDetectionEvaluation: """Internal implementation of Pascal object detection metrics.""" - def __init__(self, - num_groundtruth_classes, - matching_iou_threshold=0.5, - nms_iou_threshold=1.0, - nms_max_output_boxes=10000, - use_weighted_mean_ap=False, - label_id_offset=0): + def __init__( + self, + num_groundtruth_classes, + matching_iou_threshold=0.5, + nms_iou_threshold=1.0, + nms_max_output_boxes=10000, + use_weighted_mean_ap=False, + label_id_offset=0, + ): if num_groundtruth_classes < 1: raise ValueError( 'Need at least 1 groundtruth class for evaluation.') @@ -391,11 +399,13 @@ def _initialize_detections(self): def clear_detections(self): self._initialize_detections() - def add_single_ground_truth_image_info(self, - image_key, - groundtruth_boxes, - groundtruth_class_labels, - groundtruth_masks=None): + def add_single_ground_truth_image_info( + self, + image_key, + groundtruth_boxes, + groundtruth_class_labels, + groundtruth_masks=None, + ): """Adds groundtruth for a single image to be used for evaluation. Args: @@ -410,8 +420,8 @@ def add_single_ground_truth_image_info(self, masks. The mask values range from 0 to 1. 
""" if image_key in self.groundtruth_boxes: - warnings.warn(('image %s has already been added to the ground ' - 'truth database.'), image_key) + logging.warn(('image %s has already been added to the ground ' + 'truth database.'), image_key) return self.groundtruth_boxes[image_key] = groundtruth_boxes @@ -420,12 +430,14 @@ def add_single_ground_truth_image_info(self, self._update_ground_truth_statistics(groundtruth_class_labels) - def add_single_detected_image_info(self, - image_key, - detected_boxes, - detected_scores, - detected_class_labels, - detected_masks=None): + def add_single_detected_image_info( + self, + image_key, + detected_boxes, + detected_scores, + detected_class_labels, + detected_masks=None, + ): """Adds detections for a single image to be used for evaluation. Args: @@ -456,8 +468,8 @@ def add_single_detected_image_info(self, ) if image_key in self.detection_keys: - warnings.warn(('image %s has already been added to the ground ' - 'truth database.'), image_key) + logging.warn(('image %s has already been added to the ground ' + 'truth database.'), image_key) return self.detection_keys.add(image_key) @@ -524,7 +536,8 @@ def evaluate(self): logging.info( 'The following classes have no ground truth examples: %s', np.squeeze(np.argwhere(self.num_gt_instances_per_class == 0)) + - self.label_id_offset) + self.label_id_offset, + ) if self.use_weighted_mean_ap: all_scores = np.array([], dtype=float) @@ -544,8 +557,10 @@ def evaluate(self): all_scores = np.append(all_scores, scores) all_tp_fp_labels = np.append(all_tp_fp_labels, tp_fp_labels) precision, recall = metrics.compute_precision_recall( - scores, tp_fp_labels, - self.num_gt_instances_per_class[class_index]) + scores, + tp_fp_labels, + self.num_gt_instances_per_class[class_index], + ) self.precisions_per_class.append(precision) self.recalls_per_class.append(recall) average_precision = metrics.compute_average_precision( @@ -554,7 +569,8 @@ def evaluate(self): self.corloc_per_class = 
metrics.compute_cor_loc( self.num_gt_imgs_per_class, - self.num_images_correctly_detected_per_class) + self.num_images_correctly_detected_per_class, + ) if self.use_weighted_mean_ap: num_gt_instances = np.sum(self.num_gt_instances_per_class) diff --git a/mmaction/core/evaluation/ava_evaluation/per_image_evaluation.py b/mmaction/core/evaluation/ava_evaluation/per_image_evaluation.py index 2d06672d89..6265c17d7a 100644 --- a/mmaction/core/evaluation/ava_evaluation/per_image_evaluation.py +++ b/mmaction/core/evaluation/ava_evaluation/per_image_evaluation.py @@ -40,14 +40,16 @@ def __init__(self, num_groundtruth_classes, matching_iou_threshold=0.5): self.matching_iou_threshold = matching_iou_threshold self.num_groundtruth_classes = num_groundtruth_classes - def compute_object_detection_metrics(self, - detected_boxes, - detected_scores, - detected_class_labels, - groundtruth_boxes, - groundtruth_class_labels, - detected_masks=None, - groundtruth_masks=None): + def compute_object_detection_metrics( + self, + detected_boxes, + detected_scores, + detected_class_labels, + groundtruth_boxes, + groundtruth_class_labels, + detected_masks=None, + groundtruth_masks=None, + ): """Evaluates detections as being tp, fp or ignored from a single image. The evaluation is done in two stages: @@ -103,14 +105,16 @@ def compute_object_detection_metrics(self, return scores, tp_fp_labels - def _compute_tp_fp(self, - detected_boxes, - detected_scores, - detected_class_labels, - groundtruth_boxes, - groundtruth_class_labels, - detected_masks=None, - groundtruth_masks=None): + def _compute_tp_fp( + self, + detected_boxes, + detected_scores, + detected_class_labels, + groundtruth_boxes, + groundtruth_class_labels, + detected_masks=None, + groundtruth_masks=None, + ): """Labels true/false positives of detections of an image across all classes. 
@@ -155,12 +159,18 @@ def _compute_tp_fp(self, result_scores = [] result_tp_fp_labels = [] for i in range(self.num_groundtruth_classes): - (gt_boxes_at_ith_class, gt_masks_at_ith_class, - detected_boxes_at_ith_class, detected_scores_at_ith_class, - detected_masks_at_ith_class) = self._get_ith_class_arrays( - detected_boxes, detected_scores, detected_masks, - detected_class_labels, groundtruth_boxes, groundtruth_masks, - groundtruth_class_labels, i) + ( + gt_boxes_at_ith_class, + gt_masks_at_ith_class, + detected_boxes_at_ith_class, + detected_scores_at_ith_class, + detected_masks_at_ith_class, + ) = self._get_ith_class_arrays(detected_boxes, detected_scores, + detected_masks, + detected_class_labels, + groundtruth_boxes, + groundtruth_masks, + groundtruth_class_labels, i) scores, tp_fp_labels = self._compute_tp_fp_for_single_class( detected_boxes=detected_boxes_at_ith_class, detected_scores=detected_scores_at_ith_class, @@ -172,9 +182,8 @@ def _compute_tp_fp(self, result_tp_fp_labels.append(tp_fp_labels) return result_scores, result_tp_fp_labels - @staticmethod - def _get_overlaps_and_scores_box_mode(detected_boxes, detected_scores, - groundtruth_boxes): + def _get_overlaps_and_scores_box_mode(self, detected_boxes, + detected_scores, groundtruth_boxes): """Computes overlaps and scores between detected and groudntruth boxes. Args: @@ -205,12 +214,14 @@ def _get_overlaps_and_scores_box_mode(detected_boxes, detected_scores, num_boxes = detected_boxlist.num_boxes() return iou, None, scores, num_boxes - def _compute_tp_fp_for_single_class(self, - detected_boxes, - detected_scores, - groundtruth_boxes, - detected_masks=None, - groundtruth_masks=None): + def _compute_tp_fp_for_single_class( + self, + detected_boxes, + detected_scores, + groundtruth_boxes, + detected_masks=None, + groundtruth_masks=None, + ): """Labels boxes detected with the same class from the same image as tp/fp. 
@@ -238,11 +249,15 @@ def _compute_tp_fp_for_single_class(self, if detected_boxes.size == 0: return np.array([], dtype=float), np.array([], dtype=bool) - (iou, _, scores, - num_detected_boxes) = self._get_overlaps_and_scores_box_mode( - detected_boxes=detected_boxes, - detected_scores=detected_scores, - groundtruth_boxes=groundtruth_boxes) + ( + iou, + _, + scores, + num_detected_boxes, + ) = self._get_overlaps_and_scores_box_mode( + detected_boxes=detected_boxes, + detected_scores=detected_scores, + groundtruth_boxes=groundtruth_boxes) if groundtruth_boxes.size == 0: return scores, np.zeros(num_detected_boxes, dtype=bool) @@ -267,11 +282,17 @@ def _compute_tp_fp_for_single_class(self, return scores, tp_fp_labels - @staticmethod - def _get_ith_class_arrays(detected_boxes, detected_scores, detected_masks, - detected_class_labels, groundtruth_boxes, - groundtruth_masks, groundtruth_class_labels, - class_index): + def _get_ith_class_arrays( + self, + detected_boxes, + detected_scores, + detected_masks, + detected_class_labels, + groundtruth_boxes, + groundtruth_masks, + groundtruth_class_labels, + class_index, + ): """Returns numpy arrays belonging to class with index `class_index`. Args: @@ -311,15 +332,21 @@ class labels. detected_masks_at_ith_class = detected_masks[selected_detections] else: detected_masks_at_ith_class = None - return (gt_boxes_at_ith_class, gt_masks_at_ith_class, - detected_boxes_at_ith_class, detected_scores_at_ith_class, - detected_masks_at_ith_class) - - @staticmethod - def _remove_invalid_boxes(detected_boxes, - detected_scores, - detected_class_labels, - detected_masks=None): + return ( + gt_boxes_at_ith_class, + gt_masks_at_ith_class, + detected_boxes_at_ith_class, + detected_scores_at_ith_class, + detected_masks_at_ith_class, + ) + + def _remove_invalid_boxes( + self, + detected_boxes, + detected_scores, + detected_class_labels, + detected_masks=None, + ): """Removes entries with invalid boxes. 
A box is invalid if either its xmax is smaller than its xmin, or its @@ -346,13 +373,16 @@ def _remove_invalid_boxes(detected_boxes, """ valid_indices = np.logical_and( detected_boxes[:, 0] < detected_boxes[:, 2], - detected_boxes[:, 1] < detected_boxes[:, 3]) + detected_boxes[:, 1] < detected_boxes[:, 3], + ) detected_boxes = detected_boxes[valid_indices] detected_scores = detected_scores[valid_indices] detected_class_labels = detected_class_labels[valid_indices] if detected_masks is not None: detected_masks = detected_masks[valid_indices] return [ - detected_boxes, detected_scores, detected_class_labels, - detected_masks + detected_boxes, + detected_scores, + detected_class_labels, + detected_masks, ] diff --git a/mmaction/core/evaluation/ava_utils.py b/mmaction/core/evaluation/ava_utils.py index 159297fb7d..01036b85f9 100644 --- a/mmaction/core/evaluation/ava_utils.py +++ b/mmaction/core/evaluation/ava_utils.py @@ -35,14 +35,14 @@ def results2csv(dataset, results, out_file, custom_classes=None): csv_results = det2csv(dataset, results, custom_classes) # save space for float - def to_str(item): + def tostr(item): if isinstance(item, float): return f'{item:.3f}' return str(item) with open(out_file, 'w') as f: for csv_result in csv_results: - f.write(','.join(map(to_str, csv_result))) + f.write(','.join(map(lambda x: tostr(x), csv_result))) f.write('\n') @@ -157,6 +157,7 @@ def ava_eval(result_file, label_file, ann_file, exclude_file, + max_dets=(100, ), verbose=True, custom_classes=None): @@ -185,52 +186,52 @@ def ava_eval(result_file, if verbose: print_time('Reading detection results', start) - # Evaluation for mAP - pascal_evaluator = det_eval.PascalDetectionEvaluator(categories) - - start = time.time() - for image_key in gt_boxes: - if verbose and image_key in excluded_keys: - logging.info( - 'Found excluded timestamp in detections: %s.' 
- 'It will be ignored.', image_key) - continue - pascal_evaluator.add_single_ground_truth_image_info( - image_key, { - standard_fields.InputDataFields.groundtruth_boxes: - np.array(gt_boxes[image_key], dtype=float), - standard_fields.InputDataFields.groundtruth_classes: - np.array(gt_labels[image_key], dtype=int) - }) - if verbose: - print_time('Convert groundtruth', start) - - start = time.time() - for image_key in boxes: - if verbose and image_key in excluded_keys: - logging.info( - 'Found excluded timestamp in detections: %s.' - 'It will be ignored.', image_key) - continue - pascal_evaluator.add_single_detected_image_info( - image_key, { - standard_fields.DetectionResultFields.detection_boxes: - np.array(boxes[image_key], dtype=float), - standard_fields.DetectionResultFields.detection_classes: - np.array(labels[image_key], dtype=int), - standard_fields.DetectionResultFields.detection_scores: - np.array(scores[image_key], dtype=float) - }) - if verbose: - print_time('convert detections', start) - - start = time.time() - metrics = pascal_evaluator.evaluate() - if verbose: - print_time('run_evaluator', start) - for display_name in metrics: - print(f'{display_name}=\t{metrics[display_name]}') - return { - display_name: metrics[display_name] - for display_name in metrics if 'ByCategory' not in display_name - } + if result_type == 'mAP': + pascal_evaluator = det_eval.PascalDetectionEvaluator(categories) + + start = time.time() + for image_key in gt_boxes: + if verbose and image_key in excluded_keys: + logging.info( + 'Found excluded timestamp in detections: %s.' 
+ 'It will be ignored.', image_key) + continue + pascal_evaluator.add_single_ground_truth_image_info( + image_key, { + standard_fields.InputDataFields.groundtruth_boxes: + np.array(gt_boxes[image_key], dtype=float), + standard_fields.InputDataFields.groundtruth_classes: + np.array(gt_labels[image_key], dtype=int) + }) + if verbose: + print_time('Convert groundtruth', start) + + start = time.time() + for image_key in boxes: + if verbose and image_key in excluded_keys: + logging.info( + 'Found excluded timestamp in detections: %s.' + 'It will be ignored.', image_key) + continue + pascal_evaluator.add_single_detected_image_info( + image_key, { + standard_fields.DetectionResultFields.detection_boxes: + np.array(boxes[image_key], dtype=float), + standard_fields.DetectionResultFields.detection_classes: + np.array(labels[image_key], dtype=int), + standard_fields.DetectionResultFields.detection_scores: + np.array(scores[image_key], dtype=float) + }) + if verbose: + print_time('convert detections', start) + + start = time.time() + metrics = pascal_evaluator.evaluate() + if verbose: + print_time('run_evaluator', start) + for display_name in metrics: + print(f'{display_name}=\t{metrics[display_name]}') + return { + display_name: metrics[display_name] + for display_name in metrics if 'ByCategory' not in display_name + } diff --git a/mmaction/core/evaluation/eval_hooks.py b/mmaction/core/evaluation/eval_hooks.py index 9ef5a8ad34..d96ad87a6b 100644 --- a/mmaction/core/evaluation/eval_hooks.py +++ b/mmaction/core/evaluation/eval_hooks.py @@ -4,6 +4,7 @@ from math import inf import torch.distributed as dist +from mmcv.runner import Hook from torch.nn.modules.batchnorm import _BatchNorm from torch.utils.data import DataLoader @@ -39,8 +40,6 @@ def __init__(self, *args, save_best='auto', **kwargs): if not from_mmcv: - from mmcv.runner import Hook - class EvalHook(Hook): # noqa: F811 """Non-Distributed evaluation hook. 
@@ -363,7 +362,7 @@ def _do_evaluate(self, runner): # of rank 0 to other ranks to avoid this. if self.broadcast_bn_buffer: model = runner.model - for _, module in model.named_modules(): + for name, module in model.named_modules(): if isinstance(module, _BatchNorm) and module.track_running_stats: dist.broadcast(module.running_var, 0) diff --git a/mmaction/datasets/__init__.py b/mmaction/datasets/__init__.py index c7e23fe252..c4b10e53e5 100644 --- a/mmaction/datasets/__init__.py +++ b/mmaction/datasets/__init__.py @@ -5,7 +5,7 @@ from .ava_dataset import AVADataset from .base import BaseDataset from .blending_utils import (BaseMiniBatchBlending, CutmixBlending, - MixupBlending, LabelSmoothing) + MixupBlending) from .builder import (BLENDINGS, DATASETS, PIPELINES, build_dataloader, build_dataset) from .dataset_wrappers import RepeatDataset @@ -22,6 +22,6 @@ 'RawframeDataset', 'BaseDataset', 'ActivityNetDataset', 'SSNDataset', 'HVUDataset', 'AudioDataset', 'AudioFeatureDataset', 'ImageDataset', 'RawVideoDataset', 'AVADataset', 'AudioVisualDataset', - 'BaseMiniBatchBlending', 'CutmixBlending', 'MixupBlending', 'LabelSmoothing', 'DATASETS', + 'BaseMiniBatchBlending', 'CutmixBlending', 'MixupBlending', 'DATASETS', 'PIPELINES', 'BLENDINGS', 'PoseDataset' ] diff --git a/mmaction/datasets/audio_visual_dataset.py b/mmaction/datasets/audio_visual_dataset.py index e3d5fabfbf..6e10b4b040 100644 --- a/mmaction/datasets/audio_visual_dataset.py +++ b/mmaction/datasets/audio_visual_dataset.py @@ -65,7 +65,7 @@ def load_annotations(self): idx += 1 # idx for label[s] label = [int(x) for x in line_split[idx:]] - assert len(label) != 0, f'missing label in line: {line}' + assert len(label), f'missing label in line: {line}' if self.multi_class: assert self.num_classes is not None video_info['label'] = label diff --git a/mmaction/datasets/base.py b/mmaction/datasets/base.py index e4f753388c..62fe34f214 100644 --- a/mmaction/datasets/base.py +++ b/mmaction/datasets/base.py @@ -90,7 +90,7 @@ 
def __init__(self, self.video_infos_by_class = self.parse_by_class() class_prob = [] - for _, samples in self.video_infos_by_class.items(): + for k, samples in self.video_infos_by_class.items(): class_prob.append(len(samples) / len(self.video_infos)) class_prob = [x**self.power for x in class_prob] diff --git a/mmaction/datasets/blending_utils.py b/mmaction/datasets/blending_utils.py index 64fdcf7eec..8ef35b0e73 100644 --- a/mmaction/datasets/blending_utils.py +++ b/mmaction/datasets/blending_utils.py @@ -6,20 +6,14 @@ from .builder import BLENDINGS -__all__ = ['BaseMiniBatchBlending', 'MixupBlending', 'CutmixBlending', 'LabelSmoothing'] - -def one_hot(x, num_classes, on_value=1., off_value=0., device='cuda'): - x = x.long().view(-1, 1) - return torch.full((x.size()[0], num_classes), off_value, device=device).scatter_(1, x, on_value) +__all__ = ['BaseMiniBatchBlending', 'MixupBlending', 'CutmixBlending'] class BaseMiniBatchBlending(metaclass=ABCMeta): """Base class for Image Aliasing.""" - def __init__(self, num_classes, smoothing=0.): + def __init__(self, num_classes): self.num_classes = num_classes - self.off_value = smoothing / self.num_classes - self.on_value = 1. - smoothing + self.off_value @abstractmethod def do_blending(self, imgs, label, **kwargs): @@ -53,7 +47,7 @@ def __call__(self, imgs, label, **kwargs): the shape of (B, 1, num_classes) and all elements are in range [0, 1]. """ - one_hot_label = one_hot(label, num_classes=self.num_classes, on_value=self.on_value, off_value=self.off_value, device=label.device) + one_hot_label = F.one_hot(label, num_classes=self.num_classes) mixed_imgs, mixed_label = self.do_blending(imgs, one_hot_label, **kwargs) @@ -74,8 +68,8 @@ class MixupBlending(BaseMiniBatchBlending): alpha (float): Parameters for Beta distribution. 
""" - def __init__(self, num_classes, alpha=.2, smoothing=0.): - super().__init__(num_classes=num_classes, smoothing=smoothing) + def __init__(self, num_classes, alpha=.2): + super().__init__(num_classes=num_classes) self.beta = Beta(alpha, alpha) def do_blending(self, imgs, label, **kwargs): @@ -95,16 +89,18 @@ def do_blending(self, imgs, label, **kwargs): @BLENDINGS.register_module() class CutmixBlending(BaseMiniBatchBlending): """Implementing Cutmix in a mini-batch. + This module is proposed in `CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features `_. Code Reference https://github.com/clovaai/CutMix-PyTorch + Args: num_classes (int): The number of classes. alpha (float): Parameters for Beta distribution. """ - def __init__(self, num_classes, alpha=.2, smoothing=0.): - super().__init__(num_classes=num_classes, smoothing=smoothing) + def __init__(self, num_classes, alpha=.2): + super().__init__(num_classes=num_classes) self.beta = Beta(alpha, alpha) @staticmethod @@ -144,9 +140,3 @@ def do_blending(self, imgs, label, **kwargs): label = lam * label + (1 - lam) * label[rand_index, :] return imgs, label - - -@BLENDINGS.register_module() -class LabelSmoothing(BaseMiniBatchBlending): - def do_blending(self, imgs, label, **kwargs): - return imgs, label diff --git a/mmaction/datasets/pipelines/__init__.py b/mmaction/datasets/pipelines/__init__.py index ee71544de3..8a15583ce9 100644 --- a/mmaction/datasets/pipelines/__init__.py +++ b/mmaction/datasets/pipelines/__init__.py @@ -1,7 +1,7 @@ from .augmentations import (AudioAmplify, CenterCrop, ColorJitter, EntityBoxCrop, EntityBoxFlip, EntityBoxRescale, Flip, Fuse, Imgaug, MelSpectrogram, MultiGroupCrop, - MultiScaleCrop, Normalize, RandomCrop, RandomErasing, + MultiScaleCrop, Normalize, RandomCrop, RandomRescale, RandomResizedCrop, RandomScale, Resize, TenCrop, ThreeCrop) from .compose import Compose @@ -21,7 +21,7 @@ __all__ = [ 'SampleFrames', 'PyAVDecode', 'DecordDecode', 
'DenseSampleFrames', - 'OpenCVDecode', 'FrameSelector', 'MultiGroupCrop', 'MultiScaleCrop', 'RandomErasing', + 'OpenCVDecode', 'FrameSelector', 'MultiGroupCrop', 'MultiScaleCrop', 'RandomResizedCrop', 'RandomCrop', 'Resize', 'Flip', 'Fuse', 'Normalize', 'ThreeCrop', 'CenterCrop', 'TenCrop', 'ImageToTensor', 'Transpose', 'Collect', 'FormatShape', 'Compose', 'ToTensor', 'ToDataContainer', diff --git a/mmaction/datasets/pipelines/augmentations.py b/mmaction/datasets/pipelines/augmentations.py index 94f0a896ef..839fb115aa 100644 --- a/mmaction/datasets/pipelines/augmentations.py +++ b/mmaction/datasets/pipelines/augmentations.py @@ -5,8 +5,6 @@ import mmcv import numpy as np from torch.nn.modules.utils import _pair -import timm.data as tdata -import torch from ..builder import PIPELINES @@ -276,8 +274,7 @@ def __init__(self, transforms): self.aug = iaa.Sequential( [self.imgaug_builder(t) for t in self.transforms]) - @staticmethod - def default_transforms(): + def default_transforms(self): """Default transforms for imgaug. Implement RandAugment by imgaug. 
@@ -330,8 +327,8 @@ def default_transforms(): type='Cutout', nb_iterations=1, size=0.2 * cur_level, - squared=True) - ]) + squared=True), + ]), ] def imgaug_builder(self, cfg): @@ -424,41 +421,6 @@ def __call__(self, results): return results -@PIPELINES.register_module() -class RandomErasing(tdata.random_erasing.RandomErasing): - def __init__(self, device='cpu', **args): - super().__init__(device=device, **args) - - def __call__(self, results): - in_type = results['imgs'][0].dtype.type - - rand_state = random.getstate() - torchrand_state = torch.get_rng_state() - numpyrand_state = np.random.get_state() - # not using cuda to preserve the determiness - - out_frame = [] - for frame in results['imgs']: - random.setstate(rand_state) - torch.set_rng_state(torchrand_state) - np.random.set_state(numpyrand_state) - frame = super().__call__(torch.from_numpy(frame).permute(2, 0, 1)).permute(1, 2, 0).numpy() - out_frame.append(frame) - - results['imgs'] = out_frame - img_h, img_w, _ = results['imgs'][0].shape - - out_type = results['imgs'][0].dtype.type - assert in_type == out_type, \ - ('Timmaug input dtype and output dtype are not the same. ', - f'Convert from {in_type} to {out_type}') - - if 'gt_bboxes' in results: - raise NotImplementedError('only support recognition now') - assert results['img_shape'] == (img_h, img_w) - - return results - @PIPELINES.register_module() class Fuse: @@ -592,17 +554,14 @@ def __init__(self, size, lazy=False): self.size = size self.lazy = lazy - @staticmethod - def _crop_kps(kps, crop_bbox): + def _crop_kps(self, kps, crop_bbox): return kps - crop_bbox[:2] - @staticmethod - def _crop_imgs(imgs, crop_bbox): + def _crop_imgs(self, imgs, crop_bbox): x1, y1, x2, y2 = crop_bbox return [img[y1:y2, x1:x2] for img in imgs] - @staticmethod - def _box_crop(box, crop_bbox): + def _box_crop(self, box, crop_bbox): """Crop the bounding boxes according to the crop_bbox. 
Args: @@ -1110,12 +1069,10 @@ def _resize_imgs(self, imgs, new_w, new_h): for img in imgs ] - @staticmethod - def _resize_kps(kps, scale_factor): + def _resize_kps(self, kps, scale_factor): return kps * scale_factor - @staticmethod - def _box_resize(box, scale_factor): + def _box_resize(self, box, scale_factor): """Rescale the bounding boxes according to the scale_factor. Args: @@ -1307,8 +1264,7 @@ def _flip_kps(self, kps, kpscores, img_width): kpscores = kpscores[:, :, new_order] return kps, kpscores - @staticmethod - def _box_flip(box, img_width): + def _box_flip(self, box, img_width): """Flip the bounding boxes given the width of the image. Args: diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py index 9e03cc6fe2..69e1d38e56 100644 --- a/mmaction/datasets/pipelines/loading.py +++ b/mmaction/datasets/pipelines/loading.py @@ -12,7 +12,6 @@ from ...utils import get_random_string, get_shm_dir, get_thread_id from ..builder import PIPELINES -import random @PIPELINES.register_module() @@ -110,8 +109,7 @@ def __init__(self, twice_sample=False, out_of_bound_opt='loop', test_mode=False, - start_index=None, - frame_uniform=False): + start_index=None): self.clip_len = clip_len self.frame_interval = frame_interval @@ -120,7 +118,6 @@ def __init__(self, self.twice_sample = twice_sample self.out_of_bound_opt = out_of_bound_opt self.test_mode = test_mode - self.frame_uniform = frame_uniform assert self.out_of_bound_opt in ['loop', 'repeat_last'] if start_index is not None: @@ -202,27 +199,6 @@ def _sample_clips(self, num_frames): return clip_offsets - def get_seq_frames(self, num_frames): - """ - Modified from https://github.com/facebookresearch/SlowFast/blob/64abcc90ccfdcbb11cf91d6e525bed60e92a8796/slowfast/datasets/ssv2.py#L159 - Given the video index, return the list of sampled frame indexes. - Args: - num_frames (int): Total number of frame in the video. - Returns: - seq (list): the indexes of frames of sampled from the video. 
- """ - seg_size = float(num_frames - 1) / self.clip_len - seq = [] - for i in range(self.clip_len): - start = int(np.round(seg_size * i)) - end = int(np.round(seg_size * (i + 1))) - if not self.test_mode: - seq.append(random.randint(start, end)) - else: - seq.append((start + end) // 2) - - return np.array(seq) - def __call__(self, results): """Perform the SampleFrames loading. @@ -231,35 +207,31 @@ def __call__(self, results): to the next transform in pipeline. """ total_frames = results['total_frames'] - if self.frame_uniform: # sthv2 sampling strategy - assert results['start_index'] == 0 - frame_inds = self.get_seq_frames(total_frames) - else: - clip_offsets = self._sample_clips(total_frames) - frame_inds = clip_offsets[:, None] + np.arange( - self.clip_len)[None, :] * self.frame_interval - frame_inds = np.concatenate(frame_inds) - - if self.temporal_jitter: - perframe_offsets = np.random.randint( - self.frame_interval, size=len(frame_inds)) - frame_inds += perframe_offsets - - frame_inds = frame_inds.reshape((-1, self.clip_len)) - if self.out_of_bound_opt == 'loop': - frame_inds = np.mod(frame_inds, total_frames) - elif self.out_of_bound_opt == 'repeat_last': - safe_inds = frame_inds < total_frames - unsafe_inds = 1 - safe_inds - last_ind = np.max(safe_inds * frame_inds, axis=1) - new_inds = (safe_inds * frame_inds + (unsafe_inds.T * last_ind).T) - frame_inds = new_inds - else: - raise ValueError('Illegal out_of_bound option.') - start_index = results['start_index'] - frame_inds = np.concatenate(frame_inds) + start_index + clip_offsets = self._sample_clips(total_frames) + frame_inds = clip_offsets[:, None] + np.arange( + self.clip_len)[None, :] * self.frame_interval + frame_inds = np.concatenate(frame_inds) + + if self.temporal_jitter: + perframe_offsets = np.random.randint( + self.frame_interval, size=len(frame_inds)) + frame_inds += perframe_offsets + + frame_inds = frame_inds.reshape((-1, self.clip_len)) + if self.out_of_bound_opt == 'loop': + frame_inds = 
np.mod(frame_inds, total_frames) + elif self.out_of_bound_opt == 'repeat_last': + safe_inds = frame_inds < total_frames + unsafe_inds = 1 - safe_inds + last_ind = np.max(safe_inds * frame_inds, axis=1) + new_inds = (safe_inds * frame_inds + (unsafe_inds.T * last_ind).T) + frame_inds = new_inds + else: + raise ValueError('Illegal out_of_bound option.') + start_index = results['start_index'] + frame_inds = np.concatenate(frame_inds) + start_index results['frame_inds'] = frame_inds.astype(np.int) results['clip_len'] = self.clip_len results['frame_interval'] = self.frame_interval diff --git a/mmaction/datasets/pipelines/pose_loading.py b/mmaction/datasets/pipelines/pose_loading.py index ae198d42ed..9c19e25427 100644 --- a/mmaction/datasets/pipelines/pose_loading.py +++ b/mmaction/datasets/pipelines/pose_loading.py @@ -146,8 +146,7 @@ class PoseDecode: applicable). """ - @staticmethod - def _load_kp(kp, frame_inds): + def _load_kp(self, kp, frame_inds): """Load keypoints given frame indices. Args: @@ -157,8 +156,7 @@ def _load_kp(kp, frame_inds): return [x[frame_inds].astype(np.float32) for x in kp] - @staticmethod - def _load_kpscore(kpscore, frame_inds): + def _load_kpscore(self, kpscore, frame_inds): """Load keypoint scores given frame indices. 
Args: @@ -191,7 +189,7 @@ def __call__(self, results): return results def __repr__(self): - repr_str = f'{self.__class__.__name__}()' + repr_str = (f'{self.__class__.__name__}()') return repr_str diff --git a/mmaction/datasets/ssn_dataset.py b/mmaction/datasets/ssn_dataset.py index 8a7f1dd0d2..76d24324df 100644 --- a/mmaction/datasets/ssn_dataset.py +++ b/mmaction/datasets/ssn_dataset.py @@ -767,7 +767,7 @@ def prepare_train_frames(self, idx): out_proposal_labels = [] out_proposal_reg_targets = [] - for _, proposal in enumerate(results['out_proposals']): + for idx, proposal in enumerate(results['out_proposals']): # proposal: [(video_id, SSNInstance), proposal_type] num_frames = proposal[0][1].num_video_frames diff --git a/mmaction/models/__init__.py b/mmaction/models/__init__.py index d612642376..aec46b3c53 100644 --- a/mmaction/models/__init__.py +++ b/mmaction/models/__init__.py @@ -15,13 +15,13 @@ CrossEntropyLoss, HVULoss, NLLLoss, OHEMHingeLoss, SSNLoss) from .necks import TPN -from .recognizers import (AudioRecognizer, BaseRecognizer, Recognizer2D, - Recognizer3D) +from .recognizers import (AudioRecognizer, BaseRecognizer, recognizer2d, + recognizer3d, swintransformer3d) from .roi_extractors import SingleRoIExtractor3D __all__ = [ 'BACKBONES', 'HEADS', 'RECOGNIZERS', 'build_recognizer', 'build_head', - 'build_backbone', 'Recognizer2D', 'Recognizer3D', 'C3D', 'ResNet', + 'build_backbone', 'recognizer2d', 'recognizer3d', 'swintransformer3d', 'C3D', 'ResNet', 'ResNet3d', 'ResNet2Plus1d', 'I3DHead', 'TSNHead', 'TSMHead', 'BaseHead', 'BaseRecognizer', 'LOSSES', 'CrossEntropyLoss', 'NLLLoss', 'HVULoss', 'ResNetTSM', 'ResNet3dSlowFast', 'SlowFastHead', 'Conv2plus1d', diff --git a/mmaction/models/backbones/__init__.py b/mmaction/models/backbones/__init__.py index ef1174d9c1..4999d12ba0 100644 --- a/mmaction/models/backbones/__init__.py +++ b/mmaction/models/backbones/__init__.py @@ -12,11 +12,9 @@ from .resnet_tsm import ResNetTSM from .tanet import TANet from .x3d 
import X3D -from .swin_transformer import SwinTransformer3D - __all__ = [ 'C3D', 'ResNet', 'ResNet3d', 'ResNetTSM', 'ResNet2Plus1d', 'ResNet3dSlowFast', 'ResNet3dSlowOnly', 'ResNet3dCSN', 'ResNetTIN', 'X3D', - 'ResNetAudio', 'ResNet3dLayer', 'MobileNetV2TSM', 'MobileNetV2', 'TANet', 'SwinTransformer3D' + 'ResNetAudio', 'ResNet3dLayer', 'MobileNetV2TSM', 'MobileNetV2', 'TANet' ] diff --git a/mmaction/models/backbones/mobilenet_v2.py b/mmaction/models/backbones/mobilenet_v2.py index 5dce73502b..5a093fa1fa 100644 --- a/mmaction/models/backbones/mobilenet_v2.py +++ b/mmaction/models/backbones/mobilenet_v2.py @@ -107,8 +107,8 @@ def forward(self, x): def _inner_forward(x): if self.use_res_connect: return x + self.conv(x) - - return self.conv(x) + else: + return self.conv(x) if self.with_cp and x.requires_grad: out = cp.checkpoint(_inner_forward, x) @@ -275,8 +275,8 @@ def forward(self, x): if len(outs) == 1: return outs[0] - - return tuple(outs) + else: + return tuple(outs) def _freeze_stages(self): if self.frozen_stages >= 0: diff --git a/mmaction/models/backbones/resnet3d.py b/mmaction/models/backbones/resnet3d.py index 79f98e0f7a..83a64801f7 100644 --- a/mmaction/models/backbones/resnet3d.py +++ b/mmaction/models/backbones/resnet3d.py @@ -345,15 +345,11 @@ class ResNet3d(nn.Module): dilations (Sequence[int]): Dilation of each stage. Default: ``(1, 1, 1, 1)``. conv1_kernel (Sequence[int]): Kernel size of the first conv layer. - Default: ``(3, 7, 7)``. - conv1_stride_s (int): Spatial stride of the first conv layer. - Default: 2. + Default: ``(5, 7, 7)``. conv1_stride_t (int): Temporal stride of the first conv layer. - Default: 1. - pool1_stride_s (int): Spatial stride of the first pooling layer. Default: 2. pool1_stride_t (int): Temporal stride of the first pooling layer. - Default: 1. + Default: 2. with_pool2 (bool): Whether to use pool2. Default: True. style (str): `pytorch` or `caffe`. 
If set to "pytorch", the stride-two layer is the 3x3 conv layer, otherwise the stride-two layer is @@ -362,7 +358,7 @@ class ResNet3d(nn.Module): not freezing any parameters. Default: -1. inflate (Sequence[int]): Inflate Dims of each block. Default: (1, 1, 1, 1). - inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the + inflate_style (str): ``3x1x1`` or ``1x1x1``. which determines the kernel sizes and padding strides for conv1 and conv2 in each block. Default: '3x1x1'. conv_cfg (dict): Config for conv layers. required keys are ``type`` @@ -405,11 +401,11 @@ def __init__(self, spatial_strides=(1, 2, 2, 2), temporal_strides=(1, 1, 1, 1), dilations=(1, 1, 1, 1), - conv1_kernel=(3, 7, 7), + conv1_kernel=(5, 7, 7), conv1_stride_s=2, - conv1_stride_t=1, + conv1_stride_t=2, pool1_stride_s=2, - pool1_stride_t=1, + pool1_stride_t=2, with_pool2=True, style='pytorch', frozen_stages=-1, @@ -544,7 +540,7 @@ def make_res_layer(block, Default: ``pytorch``. inflate (int | Sequence[int]): Determine whether to inflate for each block. Default: 1. - inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines + inflate_style (str): ``3x1x1`` or ``1x1x1``. which determines the kernel sizes and padding strides for conv1 and conv2 in each block. Default: '3x1x1'. non_local (int | Sequence[int]): Determine whether to apply @@ -879,7 +875,7 @@ class ResNet3dLayer(nn.Module): the first 1x1 conv layer. Default: 'pytorch'. all_frozen (bool): Frozen all modules in the layer. Default: False. inflate (int): Inflate Dims of each block. Default: 1. - inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the + inflate_style (str): ``3x1x1`` or ``1x1x1``. which determines the kernel sizes and padding strides for conv1 and conv2 in each block. Default: '3x1x1'. conv_cfg (dict): Config for conv layers. 
required keys are ``type`` @@ -935,7 +931,7 @@ def __init__(self, self.pretrained2d = pretrained2d self.stage = stage # stage index is 0 based - assert 0 <= stage <= 3 + assert stage >= 0 and stage <= 3 self.base_channels = base_channels self.spatial_stride = spatial_stride diff --git a/mmaction/models/backbones/resnet3d_csn.py b/mmaction/models/backbones/resnet3d_csn.py index 5d041d5450..4539dec01e 100644 --- a/mmaction/models/backbones/resnet3d_csn.py +++ b/mmaction/models/backbones/resnet3d_csn.py @@ -84,7 +84,7 @@ class ResNet3dCSN(ResNet3d): norm_cfg (dict): Config for norm layers. required keys are `type` and `requires_grad`. Default: dict(type='BN3d', requires_grad=True, eps=1e-3). - inflate_style (str): `3x1x1` or `3x3x3`. which determines the kernel + inflate_style (str): `3x1x1` or `1x1x1`. which determines the kernel sizes and padding strides for conv1 and conv2 in each block. Default: '3x3x3'. bottleneck_mode (str): Determine which ways to factorize a 3D diff --git a/mmaction/models/backbones/resnet3d_slowfast.py b/mmaction/models/backbones/resnet3d_slowfast.py index be1ea1a2b2..45e9d5a7da 100644 --- a/mmaction/models/backbones/resnet3d_slowfast.py +++ b/mmaction/models/backbones/resnet3d_slowfast.py @@ -120,7 +120,7 @@ def make_res_layer(self, Default: ``pytorch``. inflate (int | Sequence[int]): Determine whether to inflate for each block. Default: 1. - inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines + inflate_style (str): ``3x1x1`` or ``1x1x1``. which determines the kernel sizes and padding strides for conv1 and conv2 in each block. Default: ``3x1x1``. 
non_local (int | Sequence[int]): Determine whether to apply diff --git a/mmaction/models/backbones/resnet_audio.py b/mmaction/models/backbones/resnet_audio.py index 63c0ff0d8a..d4fd9e1ece 100644 --- a/mmaction/models/backbones/resnet_audio.py +++ b/mmaction/models/backbones/resnet_audio.py @@ -180,7 +180,7 @@ def __init__(self, self.in_channels = in_channels self.base_channels = base_channels self.num_stages = num_stages - assert 1 <= num_stages <= 4 + assert num_stages >= 1 and num_stages <= 4 self.dilations = dilations self.conv1_kernel = conv1_kernel self.conv1_stride = conv1_stride @@ -222,8 +222,8 @@ def __init__(self, self.feat_dim = self.block.expansion * self.base_channels * 2**( len(self.stage_blocks) - 1) - @staticmethod - def make_res_layer(block, + def make_res_layer(self, + block, inplanes, planes, blocks, @@ -241,7 +241,7 @@ def make_res_layer(block, planes (int): Number of channels for the output feature in each block. blocks (int): Number of residual blocks. - stride (Sequence[int]): Strides of residual blocks of each stage. + strides (Sequence[int]): Strides of residual blocks of each stage. Default: (1, 2, 2, 2). dilation (int): Spacing between kernel elements. Default: 1. 
factorize (int | Sequence[int]): Determine whether to factorize diff --git a/mmaction/models/backbones/tanet.py b/mmaction/models/backbones/tanet.py index bb446ea23d..15d3487d1a 100644 --- a/mmaction/models/backbones/tanet.py +++ b/mmaction/models/backbones/tanet.py @@ -41,32 +41,32 @@ def __init__(self, block, num_segments, tam_cfg=dict()): 'on Bottleneck block.') def forward(self, x): - assert isinstance(self.block, Bottleneck) + if isinstance(self.block, Bottleneck): - def _inner_forward(x): - """Forward wrapper for utilizing checkpoint.""" - identity = x + def _inner_forward(x): + """Forward wrapper for utilizing checkpoint.""" + identity = x - out = self.block.conv1(x) - out = self.tam(out) - out = self.block.conv2(out) - out = self.block.conv3(out) + out = self.block.conv1(x) + out = self.tam(out) + out = self.block.conv2(out) + out = self.block.conv3(out) - if self.block.downsample is not None: - identity = self.block.downsample(x) + if self.block.downsample is not None: + identity = self.block.downsample(x) - out = out + identity + out = out + identity - return out + return out - if self.block.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) + if self.block.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) - out = self.block.relu(out) + out = self.block.relu(out) - return out + return out @BACKBONES.register_module() diff --git a/mmaction/models/common/lfb.py b/mmaction/models/common/lfb.py index f54ae36e31..e942dd165e 100644 --- a/mmaction/models/common/lfb.py +++ b/mmaction/models/common/lfb.py @@ -14,7 +14,7 @@ lmdb_imported = False -class LFB: +class LFB(object): """Long-Term Feature Bank (LFB). 
LFB is proposed in `Long-Term Feature Banks for Detailed Video diff --git a/mmaction/models/heads/bbox_head.py b/mmaction/models/heads/bbox_head.py index 3f3bfeead0..cd2cc52622 100644 --- a/mmaction/models/heads/bbox_head.py +++ b/mmaction/models/heads/bbox_head.py @@ -122,8 +122,8 @@ def forward(self, x): # We do not predict bbox, so return None return cls_score, None - @staticmethod - def get_targets(sampling_results, gt_bboxes, gt_labels, rcnn_train_cfg): + def get_targets(self, sampling_results, gt_bboxes, gt_labels, + rcnn_train_cfg): pos_proposals = [res.pos_bboxes for res in sampling_results] neg_proposals = [res.neg_bboxes for res in sampling_results] pos_gt_labels = [res.pos_gt_labels for res in sampling_results] @@ -131,8 +131,7 @@ def get_targets(sampling_results, gt_bboxes, gt_labels, rcnn_train_cfg): pos_gt_labels, rcnn_train_cfg) return cls_reg_targets - @staticmethod - def recall_prec(pred_vec, target_vec): + def recall_prec(self, pred_vec, target_vec): """ Args: pred_vec (tensor[N x C]): each element is either 0 or 1 @@ -145,7 +144,7 @@ def recall_prec(pred_vec, target_vec): prec = correct.sum(1) / (pred_vec.sum(1) + 1e-6) return recall.mean(), prec.mean() - def multi_label_accuracy(self, pred, target, thr=0.5): + def multilabel_accuracy(self, pred, target, thr=0.5): pred = pred.sigmoid() pred_vec = pred > thr # Target is 0 or 1, so using 0.5 as the borderline is OK @@ -190,7 +189,7 @@ def loss(self, F_loss = self.focal_alpha * (1 - pt)**self.focal_gamma * loss losses['loss_action_cls'] = torch.mean(F_loss) - recall_thr, prec_thr, recall_k, prec_k = self.multi_label_accuracy( + recall_thr, prec_thr, recall_k, prec_k = self.multilabel_accuracy( cls_score, labels, thr=0.5) losses['recall@thr=0.5'] = recall_thr losses['prec@thr=0.5'] = prec_thr diff --git a/mmaction/models/heads/lfb_infer_head.py b/mmaction/models/heads/lfb_infer_head.py index 69bdf8ae2a..1111b180c5 100644 --- a/mmaction/models/heads/lfb_infer_head.py +++ 
b/mmaction/models/heads/lfb_infer_head.py @@ -37,7 +37,7 @@ def __init__(self, temporal_pool_type='avg', spatial_pool_type='max'): super().__init__() - rank, _ = get_dist_info() + rank, world_size = get_dist_info() if rank == 0: if not osp.exists(lfb_prefix_path): print(f'lfb prefix path {lfb_prefix_path} does not exist. ' diff --git a/mmaction/models/heads/misc_head.py b/mmaction/models/heads/misc_head.py index 66e1b2c3b7..72cdaab547 100644 --- a/mmaction/models/heads/misc_head.py +++ b/mmaction/models/heads/misc_head.py @@ -75,7 +75,7 @@ def __init__(self, act_cfg=act_cfg) convs = [] - for _ in range(num_convs - 1): + for i in range(num_convs - 1): conv = ConvModule( out_channels, out_channels, diff --git a/mmaction/models/recognizers/__init__.py b/mmaction/models/recognizers/__init__.py index 9d0bccd56f..0557ec7a8f 100644 --- a/mmaction/models/recognizers/__init__.py +++ b/mmaction/models/recognizers/__init__.py @@ -2,5 +2,5 @@ from .base import BaseRecognizer from .recognizer2d import Recognizer2D from .recognizer3d import Recognizer3D - -__all__ = ['BaseRecognizer', 'Recognizer2D', 'Recognizer3D', 'AudioRecognizer'] +from .swintransformer3d import SwinTransformer3D +__all__ = ['BaseRecognizer', 'Recognizer2D', 'Recognizer3D','SwinTransformer3D', 'AudioRecognizer'] diff --git a/mmaction/models/recognizers/base.py b/mmaction/models/recognizers/base.py index 41164f3bd2..e76a170de3 100644 --- a/mmaction/models/recognizers/base.py +++ b/mmaction/models/recognizers/base.py @@ -124,11 +124,11 @@ def init_weights(self): """Initialize the model network weights.""" if self.backbone_from in ['mmcls', 'mmaction2']: self.backbone.init_weights() - elif self.backbone_from in ['torchvision', 'timm']: + elif self.backbone_from == 'torchvision': warnings.warn('We do not initialize weights for backbones in ' - f'{self.backbone_from}, since the weights for ' - f'backbones in {self.backbone_from} are initialized' - 'in their __init__ functions.') + 'torchvision, since the weights 
for backbones in ' + 'torchvision are initialized in their __init__ ' + 'functions. ') else: raise NotImplementedError('Unsupported backbone source ' f'{self.backbone_from}!') @@ -151,8 +151,6 @@ def extract_feat(self, imgs): if (hasattr(self.backbone, 'features') and self.backbone_from == 'torchvision'): x = self.backbone.features(imgs) - elif self.backbone_from == 'timm': - x = self.backbone.forward_features(imgs) else: x = self.backbone(imgs) return x diff --git a/mmaction/models/recognizers/recognizer2d.py b/mmaction/models/recognizers/recognizer2d.py index 6b4bedba04..d3444845f6 100644 --- a/mmaction/models/recognizers/recognizer2d.py +++ b/mmaction/models/recognizers/recognizer2d.py @@ -21,7 +21,7 @@ def forward_train(self, imgs, labels, **kwargs): x = self.extract_feat(imgs) - if self.backbone_from in ['torchvision', 'timm']: + if self.backbone_from == 'torchvision': if len(x.shape) == 4 and (x.shape[2] > 1 or x.shape[3] > 1): # apply adaptive avg pooling x = nn.AdaptiveAvgPool2d(1)(x) @@ -55,7 +55,7 @@ def _do_test(self, imgs): x = self.extract_feat(imgs) - if self.backbone_from in ['torchvision', 'timm']: + if self.backbone_from == 'torchvision': if len(x.shape) == 4 and (x.shape[2] > 1 or x.shape[3] > 1): # apply adaptive avg pooling x = nn.AdaptiveAvgPool2d(1)(x) diff --git a/mmaction/utils/__init__.py b/mmaction/utils/__init__.py index d1478a2630..8cb60fcd7a 100644 --- a/mmaction/utils/__init__.py +++ b/mmaction/utils/__init__.py @@ -5,11 +5,9 @@ from .misc import get_random_string, get_shm_dir, get_thread_id from .module_hooks import register_module_hooks from .precise_bn import PreciseBNHook -from .optimizer import DistOptimizerHook - __all__ = [ 'get_root_logger', 'collect_env', 'get_random_string', 'get_thread_id', 'get_shm_dir', 'GradCAM', 'PreciseBNHook', 'import_module_error_class', - 'import_module_error_func', 'register_module_hooks', 'DistOptimizerHook' + 'import_module_error_func', 'register_module_hooks' ] diff --git 
a/mmaction/utils/decorators.py b/mmaction/utils/decorators.py index 727fa61df3..798bd2f4ff 100644 --- a/mmaction/utils/decorators.py +++ b/mmaction/utils/decorators.py @@ -10,6 +10,7 @@ def decorate(func): def new_func(*args, **kwargs): raise ImportError( f'Please install {module_name} to use {func.__name__}.') + return func(*args, **kwargs) return new_func diff --git a/mmaction/utils/precise_bn.py b/mmaction/utils/precise_bn.py index 2751b2e736..c01bd4d109 100644 --- a/mmaction/utils/precise_bn.py +++ b/mmaction/utils/precise_bn.py @@ -30,7 +30,10 @@ def is_parallel_module(module): """ parallels = (DataParallel, DistributedDataParallel, MMDistributedDataParallel) - return bool(isinstance(module, parallels)) + if isinstance(module, parallels): + return True + else: + return False @torch.no_grad() From 37910ef3141c7b2eef76544f9ec8bdf26ec94c7d Mon Sep 17 00:00:00 2001 From: vidit Date: Sun, 20 Mar 2022 06:22:19 +0530 Subject: [PATCH 2/2] Fixed mmaction folder to make Swin-T working and made the installation process much faster and easier --- environment.yml | 424 +++++++++++ mmaction/core/lr/__init__.py | 3 + mmaction/core/lr/tin_lr_hook.py | 39 + .../models/recognizers/swintransformer3d.py | 681 ++++++++++++++++++ setup.sh | 7 + swint.py | 92 +++ 6 files changed, 1246 insertions(+) create mode 100644 environment.yml create mode 100644 mmaction/core/lr/__init__.py create mode 100644 mmaction/core/lr/tin_lr_hook.py create mode 100644 mmaction/models/recognizers/swintransformer3d.py create mode 100644 setup.sh create mode 100644 swint.py diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000000..deed242848 --- /dev/null +++ b/environment.yml @@ -0,0 +1,424 @@ +name: swint +channels: + - anaconda + - defaults + - conda-forge +dependencies: + - _anaconda_depends=2021.11=py37_0 + - _libgcc_mutex=0.1=main + - _openmp_mutex=4.5=1_gnu + - alabaster=0.7.12=py37_0 + - anaconda=custom=py37_1 + - anaconda-client=1.9.0=py37h06a4308_0 + - 
anaconda-project=0.10.2=pyhd3eb1b0_0 + - anyio=3.5.0=py37h06a4308_0 + - appdirs=1.4.4=pyhd3eb1b0_0 + - argcomplete=1.12.3=pyhd3eb1b0_0 + - argh=0.26.2=py37_0 + - argon2-cffi=21.3.0=pyhd3eb1b0_0 + - argon2-cffi-bindings=21.2.0=py37h7f8727e_0 + - arrow=0.13.1=py37_0 + - asn1crypto=1.4.0=py_0 + - astroid=2.6.6=py37h06a4308_0 + - astropy=4.3.1=py37h09021b7_0 + - async_generator=1.10=py37h28b3542_0 + - atomicwrites=1.4.0=py_0 + - attrs=21.4.0=pyhd3eb1b0_0 + - autopep8=1.6.0=pyhd3eb1b0_0 + - babel=2.9.1=pyhd3eb1b0_0 + - backcall=0.2.0=pyhd3eb1b0_0 + - backports=1.1=pyhd3eb1b0_0 + - backports.shutil_get_terminal_size=1.0.0=pyhd3eb1b0_3 + - beautifulsoup4=4.10.0=pyh06a4308_0 + - binaryornot=0.4.4=pyhd3eb1b0_1 + - bitarray=2.3.5=py37h7f8727e_0 + - bkcharts=0.2=py37_0 + - black=19.10b0=py_0 + - blas=1.0=mkl + - bleach=4.1.0=pyhd3eb1b0_0 + - blosc=1.21.0=h8c45485_0 + - bokeh=2.4.2=py37h06a4308_0 + - boto=2.49.0=py37_0 + - bottleneck=1.3.2=py37heb32a55_1 + - brotli=1.0.9=he6710b0_2 + - brotlipy=0.7.0=py37h27cfd23_1003 + - brunsli=0.1=h2531618_0 + - bzip2=1.0.8=h7b6447c_0 + - c-ares=1.18.1=h7f8727e_0 + - ca-certificates=2020.10.14=0 + - cairo=1.16.0=hf32fb01_1 + - certifi=2020.6.20=py37_0 + - cffi=1.15.0=py37hd667e15_1 + - cfitsio=3.470=hf0d0db6_6 + - chardet=4.0.0=py37h06a4308_1003 + - charls=2.2.0=h2531618_0 + - charset-normalizer=2.0.4=pyhd3eb1b0_0 + - click=8.0.4=py37h06a4308_0 + - cloudpickle=2.0.0=pyhd3eb1b0_0 + - clyent=1.2.2=py37_1 + - colorama=0.4.4=pyhd3eb1b0_0 + - conda=4.11.0=py37h06a4308_0 + - conda-content-trust=0.1.1=pyhd3eb1b0_0 + - conda-pack=0.6.0=pyhd3eb1b0_0 + - conda-package-handling=1.7.3=py37h27cfd23_1 + - conda-token=0.3.0=pyhd3eb1b0_0 + - contextlib2=0.6.0.post1=pyhd3eb1b0_0 + - cookiecutter=1.7.2=pyhd3eb1b0_0 + - curl=7.80.0=h7f8727e_0 + - cycler=0.11.0=pyhd3eb1b0_0 + - cython=0.29.25=py37hdbfa776_0 + - cytoolz=0.11.0=py37h7b6447c_0 + - daal4py=2021.5.0=py37h78b71dc_0 + - dal=2021.5.1=h06a4308_803 + - dask=2021.10.0=pyhd3eb1b0_0 + - 
dask-core=2021.10.0=pyhd3eb1b0_0 + - dataclasses=0.8=pyh6d0b6a4_7 + - dbus=1.13.18=hb2f20db_0 + - debugpy=1.5.1=py37h295c915_0 + - decorator=4.4.2=py_0 + - defusedxml=0.7.1=pyhd3eb1b0_0 + - diff-match-patch=20200713=pyhd3eb1b0_0 + - distributed=2021.10.0=py37h06a4308_0 + - docutils=0.17.1=py37h06a4308_1 + - entrypoints=0.3=py37_0 + - et_xmlfile=1.1.0=py37h06a4308_0 + - expat=2.4.4=h295c915_0 + - fastcache=1.1.0=py37h7b6447c_0 + - filelock=3.4.2=pyhd3eb1b0_0 + - flake8=3.9.2=pyhd3eb1b0_0 + - flask=1.1.2=pyhd3eb1b0_0 + - fontconfig=2.13.1=h6c09931_0 + - freetype=2.11.0=h70c0345_0 + - fribidi=1.0.10=h7b6447c_0 + - fsspec=2022.1.0=pyhd3eb1b0_0 + - get_terminal_size=1.0.0=haa9412d_0 + - gevent=21.8.0=py37h7f8727e_1 + - giflib=5.2.1=h7b6447c_0 + - glib=2.69.1=h4ff587b_1 + - glob2=0.7=pyhd3eb1b0_0 + - gmp=6.2.1=h2531618_2 + - gmpy2=2.1.2=py37heeb90bb_0 + - graphite2=1.3.14=h23475e2_0 + - greenlet=1.1.1=py37h295c915_0 + - gst-plugins-base=1.14.0=h8213a91_2 + - gstreamer=1.14.0=h28cd5cc_2 + - h5py=2.10.0=py37h7918eee_0 + - harfbuzz=2.8.1=h6f93f22_0 + - hdf5=1.10.4=hb1b8bf9_0 + - heapdict=1.0.1=pyhd3eb1b0_0 + - html5lib=1.1=pyhd3eb1b0_0 + - icu=58.2=he6710b0_3 + - idna=3.3=pyhd3eb1b0_0 + - imagecodecs=2021.8.26=py37h4cda21f_0 + - imageio=2.9.0=pyhd3eb1b0_0 + - imagesize=1.3.0=pyhd3eb1b0_0 + - importlib-metadata=4.8.2=py37h06a4308_0 + - importlib_metadata=4.8.2=hd3eb1b0_0 + - inflection=0.5.1=py37h06a4308_0 + - iniconfig=1.1.1=pyhd3eb1b0_0 + - intel-openmp=2021.4.0=h06a4308_3561 + - intervaltree=3.1.0=pyhd3eb1b0_0 + - ipykernel=6.4.1=py37h06a4308_1 + - ipython=7.31.1=py37h06a4308_0 + - ipython_genutils=0.2.0=pyhd3eb1b0_1 + - ipywidgets=7.6.5=pyhd3eb1b0_1 + - itsdangerous=2.0.1=pyhd3eb1b0_0 + - jbig=2.1=hdba287a_0 + - jdcal=1.4.1=pyhd3eb1b0_0 + - jedi=0.18.1=py37h06a4308_1 + - jeepney=0.7.1=pyhd3eb1b0_0 + - jinja2=2.11.3=pyhd3eb1b0_0 + - jinja2-time=0.2.0=pyhd3eb1b0_2 + - joblib=1.1.0=pyhd3eb1b0_0 + - jpeg=9d=h7f8727e_0 + - json5=0.9.6=pyhd3eb1b0_0 + - 
jsonschema=3.2.0=pyhd3eb1b0_2 + - jupyter=1.0.0=py37_7 + - jupyter_client=6.1.12=pyhd3eb1b0_0 + - jupyter_console=6.4.0=pyhd3eb1b0_0 + - jupyter_core=4.9.1=py37h06a4308_0 + - jupyter_server=1.13.5=pyhd3eb1b0_0 + - jupyterlab=3.2.9=pyhd3eb1b0_0 + - jupyterlab_pygments=0.1.2=py_0 + - jupyterlab_server=2.10.3=pyhd3eb1b0_1 + - jupyterlab_widgets=1.0.0=pyhd3eb1b0_1 + - jxrlib=1.1=h7b6447c_2 + - keyring=23.4.0=py37h06a4308_0 + - kiwisolver=1.3.2=py37h295c915_0 + - krb5=1.19.2=hac12032_0 + - lazy-object-proxy=1.6.0=py37h27cfd23_0 + - lcms2=2.12=h3be6417_0 + - ld_impl_linux-64=2.35.1=h7274673_9 + - lerc=3.0=h295c915_0 + - libaec=1.0.4=he6710b0_1 + - libarchive=3.4.2=h62408e4_0 + - libcurl=7.80.0=h0b77cf5_0 + - libdeflate=1.8=h7f8727e_5 + - libedit=3.1.20210910=h7f8727e_0 + - libev=4.33=h7f8727e_1 + - libffi=3.3=he6710b0_2 + - libgcc-ng=9.3.0=h5101ec6_17 + - libgfortran-ng=7.5.0=ha8ba4b0_17 + - libgfortran4=7.5.0=ha8ba4b0_17 + - libgomp=9.3.0=h5101ec6_17 + - liblief=0.10.1=he6710b0_0 + - libllvm11=11.1.0=h3826bc1_0 + - libnghttp2=1.46.0=hce63b2e_0 + - libpng=1.6.37=hbc83047_0 + - libsodium=1.0.18=h7b6447c_0 + - libspatialindex=1.9.3=h2531618_0 + - libssh2=1.9.0=h1ba5d50_1 + - libstdcxx-ng=9.3.0=hd4cf53a_17 + - libtiff=4.2.0=h85742a9_0 + - libtool=2.4.6=h295c915_1008 + - libuuid=1.0.3=h7f8727e_2 + - libuv=1.40.0=h7b6447c_0 + - libwebp=1.2.2=h55f646e_0 + - libwebp-base=1.2.2=h7f8727e_0 + - libxcb=1.14=h7b6447c_0 + - libxml2=2.9.12=h03d6c58_0 + - libxslt=1.1.34=hc22bd24_0 + - libzopfli=1.0.3=he6710b0_0 + - llvmlite=0.37.0=py37h295c915_1 + - locket=0.2.1=py37h06a4308_1 + - lxml=4.7.1=py37h1f438cf_1 + - lz4-c=1.9.3=h295c915_1 + - lzo=2.10=h7b6447c_2 + - markupsafe=1.1.1=py37h14c3975_1 + - matplotlib=3.5.1=py37h06a4308_0 + - matplotlib-base=3.5.1=py37ha18d171_0 + - matplotlib-inline=0.1.2=pyhd3eb1b0_2 + - mccabe=0.6.1=py37_1 + - mistune=0.8.4=py37h14c3975_1001 + - mkl=2021.4.0=h06a4308_640 + - mkl-service=2.4.0=py37h7f8727e_0 + - mkl_fft=1.3.1=py37hd3c417c_0 + - 
mkl_random=1.2.2=py37h51133e4_0 + - mock=4.0.3=pyhd3eb1b0_0 + - more-itertools=8.12.0=pyhd3eb1b0_0 + - mpc=1.1.0=h10f8cd9_1 + - mpfr=4.0.2=hb69a4c5_1 + - mpi=1.0=mpich + - mpich=3.3.2=hc856adb_0 + - mpmath=1.2.1=py37h06a4308_0 + - msgpack-python=1.0.2=py37hff7bd54_1 + - multipledispatch=0.6.0=py37_0 + - munkres=1.1.4=py_0 + - mypy_extensions=0.4.3=py37h06a4308_1 + - nbclassic=0.3.5=pyhd3eb1b0_0 + - nbclient=0.5.11=pyhd3eb1b0_0 + - nbconvert=6.3.0=py37h06a4308_0 + - nbformat=5.1.3=pyhd3eb1b0_0 + - ncurses=6.3=h7f8727e_2 + - nest-asyncio=1.5.1=pyhd3eb1b0_0 + - networkx=2.6.3=pyhd3eb1b0_0 + - nltk=3.7=pyhd3eb1b0_0 + - nose=1.3.7=pyhd3eb1b0_1008 + - notebook=6.4.8=py37h06a4308_0 + - numba=0.54.1=py37h51133e4_0 + - numexpr=2.8.1=py37h6abb31d_0 + - numpy=1.20.3=py37hf144106_0 + - numpy-base=1.20.3=py37h74d4b33_0 + - numpydoc=1.2=pyhd3eb1b0_0 + - olefile=0.46=py37_0 + - openjpeg=2.4.0=h3ad879b_0 + - openpyxl=3.0.9=pyhd3eb1b0_0 + - openssl=1.1.1m=h7f8727e_0 + - packaging=21.3=pyhd3eb1b0_0 + - pandas=1.3.4=py37h8c16a72_0 + - pandocfilters=1.5.0=pyhd3eb1b0_0 + - pango=1.45.3=hd140c19_0 + - parso=0.8.3=pyhd3eb1b0_0 + - partd=1.2.0=pyhd3eb1b0_0 + - patchelf=0.13=h295c915_0 + - path=16.2.0=pyhd3eb1b0_0 + - path.py=12.5.0=hd3eb1b0_0 + - pathlib2=2.3.6=py37h06a4308_2 + - pathspec=0.7.0=py_0 + - patsy=0.5.2=py37h06a4308_1 + - pcre=8.45=h295c915_0 + - pep8=1.7.1=py37_0 + - pexpect=4.8.0=pyhd3eb1b0_3 + - pickleshare=0.7.5=pyhd3eb1b0_1003 + - pillow=9.0.1=py37h22f2fdc_0 + - pip + - pixman=0.40.0=h7f8727e_1 + - pkginfo=1.8.2=pyhd3eb1b0_0 + - pluggy=1.0.0=py37h06a4308_0 + - ply=3.11=py37_0 + - poyo=0.5.0=pyhd3eb1b0_0 + - prometheus_client=0.13.1=pyhd3eb1b0_0 + - prompt-toolkit=3.0.20=pyhd3eb1b0_0 + - prompt_toolkit=3.0.20=hd3eb1b0_0 + - psutil=5.8.0=py37h27cfd23_1 + - ptyprocess=0.7.0=pyhd3eb1b0_2 + - py=1.11.0=pyhd3eb1b0_0 + - py-lief=0.10.1=py37h403a769_0 + - pycodestyle=2.7.0=pyhd3eb1b0_0 + - pycosat=0.6.3=py37h27cfd23_0 + - pycparser=2.21=pyhd3eb1b0_0 + - 
pycrypto=2.6.1=py37h7b6447c_10 + - pycurl=7.44.1=py37h8f2d780_1 + - pydocstyle=6.1.1=pyhd3eb1b0_0 + - pyerfa=2.0.0=py37h27cfd23_0 + - pyflakes=2.3.1=pyhd3eb1b0_0 + - pygments=2.11.2=pyhd3eb1b0_0 + - pylint=2.9.6=py37h06a4308_1 + - pyls-spyder=0.4.0=pyhd3eb1b0_0 + - pyodbc=4.0.32=py37h295c915_0 + - pyopenssl=22.0.0=pyhd3eb1b0_0 + - pyparsing=3.0.4=pyhd3eb1b0_0 + - pyqt=5.9.2=py37h05f1152_2 + - pyrsistent=0.18.0=py37heee7806_0 + - pysocks=1.7.1=py37_1 + - pytables=3.6.1=py37h71ec239_0 + - pytest=6.2.5=py37h06a4308_2 + - python=3.7.11=h12debd9_0 + - python-dateutil=2.8.2=pyhd3eb1b0_0 + - python-libarchive-c=2.9=pyhd3eb1b0_1 + - python-lsp-black=1.0.0=pyhd3eb1b0_0 + - python-lsp-jsonrpc=1.0.0=pyhd3eb1b0_0 + - python-lsp-server=1.2.4=pyhd3eb1b0_0 + - python-slugify=5.0.2=pyhd3eb1b0_0 + - pytz=2021.3=pyhd3eb1b0_0 + - pywavelets=1.1.1=py37h7b6447c_2 + - pyxdg=0.27=pyhd3eb1b0_0 + - pyyaml=6.0=py37h7f8727e_1 + - pyzmq=22.3.0=py37h295c915_2 + - qdarkstyle=3.0.2=pyhd3eb1b0_0 + - qstylizer=0.1.10=pyhd3eb1b0_0 + - qt=5.9.7=h5867ecd_1 + - qtawesome=1.0.3=pyhd3eb1b0_0 + - qtconsole=5.2.2=pyhd3eb1b0_0 + - qtpy=1.11.2=pyhd3eb1b0_0 + - readline=8.1.2=h7f8727e_1 + - regex=2021.11.2=py37h7f8727e_0 + - requests=2.27.1=pyhd3eb1b0_0 + - ripgrep=12.1.1=0 + - rope=0.22.0=pyhd3eb1b0_0 + - rtree=0.9.7=py37h06a4308_1 + - ruamel_yaml=0.15.100=py37h27cfd23_0 + - scikit-image=0.18.3=py37h51133e4_0 + - scikit-learn=1.0.2=py37h51133e4_1 + - scikit-learn-intelex=2021.5.0=py37h06a4308_0 + - scipy=1.7.3=py37hc147768_0 + - seaborn=0.11.2=pyhd3eb1b0_0 + - secretstorage=3.3.1=py37h06a4308_0 + - send2trash=1.8.0=pyhd3eb1b0_1 + - setuptools=58.0.4=py37h06a4308_0 + - simplegeneric=0.8.1=py37_2 + - singledispatch=3.7.0=pyhd3eb1b0_1001 + - sip=4.19.8=py37hf484d3e_0 + - six=1.16.0=pyhd3eb1b0_1 + - snappy=1.1.8=he6710b0_0 + - sniffio=1.2.0=py37h06a4308_1 + - snowballstemmer=2.2.0=pyhd3eb1b0_0 + - sortedcollections=2.1.0=pyhd3eb1b0_0 + - sortedcontainers=2.4.0=pyhd3eb1b0_0 + - soupsieve=2.3.1=pyhd3eb1b0_0 + - 
sphinx=4.4.0=pyhd3eb1b0_0 + - sphinxcontrib=1.0=py37_1 + - sphinxcontrib-applehelp=1.0.2=pyhd3eb1b0_0 + - sphinxcontrib-devhelp=1.0.2=pyhd3eb1b0_0 + - sphinxcontrib-htmlhelp=2.0.0=pyhd3eb1b0_0 + - sphinxcontrib-jsmath=1.0.1=pyhd3eb1b0_0 + - sphinxcontrib-qthelp=1.0.3=pyhd3eb1b0_0 + - sphinxcontrib-serializinghtml=1.1.5=pyhd3eb1b0_0 + - sphinxcontrib-websupport=1.2.4=py_0 + - spyder=5.1.5=py37h06a4308_1 + - spyder-kernels=2.1.3=py37h06a4308_0 + - sqlalchemy=1.4.27=py37h7f8727e_0 + - sqlite=3.37.2=hc218d9a_0 + - statsmodels=0.12.2=py37h27cfd23_0 + - sympy=1.9=py37h06a4308_0 + - tbb=2021.5.0=hd09550d_0 + - tbb4py=2021.5.0=py37hd09550d_0 + - tblib=1.7.0=pyhd3eb1b0_0 + - terminado=0.13.1=py37h06a4308_0 + - testpath=0.5.0=pyhd3eb1b0_0 + - text-unidecode=1.3=pyhd3eb1b0_0 + - textdistance=4.2.1=pyhd3eb1b0_0 + - threadpoolctl=2.2.0=pyh0d69192_0 + - three-merge=0.1.1=pyhd3eb1b0_0 + - tifffile=2021.7.2=pyhd3eb1b0_2 + - tinycss=0.4=pyhd3eb1b0_1002 + - tk=8.6.11=h1ccaba5_0 + - toml=0.10.2=pyhd3eb1b0_0 + - toolz=0.11.2=pyhd3eb1b0_0 + - tornado=6.1=py37h27cfd23_0 + - tqdm=4.62.3=pyhd3eb1b0_1 + - traitlets=5.1.1=pyhd3eb1b0_0 + - typed-ast=1.4.3=py37h7f8727e_1 + - typing-extensions=3.10.0.2=hd3eb1b0_0 + - typing_extensions=3.10.0.2=pyh06a4308_0 + - ujson=4.2.0=py37h295c915_0 + - unicodecsv=0.14.1=py37_0 + - unidecode=1.2.0=pyhd3eb1b0_0 + - unixodbc=2.3.9=h7b6447c_0 + - urllib3=1.26.8=pyhd3eb1b0_0 + - watchdog=2.1.6=py37h06a4308_0 + - wcwidth=0.2.5=pyhd3eb1b0_0 + - webencodings=0.5.1=py37_1 + - websocket-client=0.58.0=py37h06a4308_4 + - werkzeug=2.0.3=pyhd3eb1b0_0 + - wheel=0.37.1=pyhd3eb1b0_0 + - whichcraft=0.6.1=pyhd3eb1b0_0 + - widgetsnbextension=3.5.2=py37h06a4308_0 + - wrapt=1.12.1=py37h7b6447c_1 + - wurlitzer=3.0.2=py37h06a4308_0 + - xlrd=2.0.1=pyhd3eb1b0_0 + - xlsxwriter=3.0.2=pyhd3eb1b0_0 + - xlwt=1.3.0=py37_0 + - xz=5.2.5=h7b6447c_0 + - yaml=0.2.5=h7b6447c_0 + - yapf=0.31.0=pyhd3eb1b0_0 + - zeromq=4.3.4=h2531618_0 + - zfp=0.5.5=h295c915_6 + - zict=2.0.0=pyhd3eb1b0_0 + - 
@HOOKS.register_module()
class TINLrUpdaterHook(LrUpdaterHook):
    """Learning-rate hook: TIN-style warmup followed by cosine annealing.

    Once warmup is over, the learning rate is annealed from the base
    learning rate towards ``min_lr`` with ``annealing_cos``.

    Args:
        min_lr (float): Target learning rate of the cosine schedule.
        **kwargs: Passed through unchanged to ``LrUpdaterHook.__init__``.
    """

    def __init__(self, min_lr, **kwargs):
        self.min_lr = min_lr
        super(TINLrUpdaterHook, self).__init__(**kwargs)

    def get_warmup_lr(self, cur_iters):
        """Return the warmup learning rates for every entry of ``regular_lr``.

        Args:
            cur_iters (int): Iteration counter, compared against
                ``self.warmup_iters`` to compute the warmup factor.

        Returns:
            list: ``self.regular_lr`` with each entry scaled by the warmup
            factor of the configured warmup mode.

        Raises:
            ValueError: If ``self.warmup`` is not ``'linear'``,
                ``'constant'`` or ``'exp'``.
        """
        if self.warmup == 'linear':
            # 'linear' warmup is rewritten according to TIN repo:
            # https://github.com/deepcs233/TIN/blob/master/main.py#L409-L412
            k = (cur_iters / self.warmup_iters) * (
                1 - self.warmup_ratio) + self.warmup_ratio
        elif self.warmup == 'constant':
            k = self.warmup_ratio
        elif self.warmup == 'exp':
            k = self.warmup_ratio**(1 - cur_iters / self.warmup_iters)
        else:
            # Previously this fell through and failed with an opaque
            # UnboundLocalError on the return statement.
            raise ValueError(
                f'Unsupported warmup type {self.warmup!r}; expected '
                "'linear', 'constant' or 'exp'.")
        return [_lr * k for _lr in self.regular_lr]

    def get_lr(self, runner, base_lr):
        """Return the cosine-annealed learning rate for the current step.

        Progress is measured in epochs when ``self.by_epoch`` is set and in
        iterations otherwise.  When warmup is enabled, ``self.warmup_iters``
        is subtracted from both the current progress and the total so the
        cosine schedule starts after warmup.

        Args:
            runner: Training runner exposing ``epoch``/``max_epochs`` and
                ``iter``/``max_iters``.
            base_lr (float): Learning rate the schedule starts from.

        Returns:
            The value of ``annealing_cos(base_lr, self.min_lr, factor)``.
        """
        if self.by_epoch:
            progress = runner.epoch
            max_progress = runner.max_epochs
        else:
            progress = runner.iter
            max_progress = runner.max_iters

        target_lr = self.min_lr
        if self.warmup is not None:
            progress = progress - self.warmup_iters
            max_progress = max_progress - self.warmup_iters
        # NOTE(review): max_progress is 0 when the run is exactly as long as
        # the warmup, which makes this division fail -- confirm configs.
        factor = progress / max_progress
        return annealing_cos(base_lr, target_lr, factor)
class Mlp(nn.Module):
    """Two-layer feed-forward network with dropout after each linear layer.

    Args:
        in_features (int): Size of the last input dimension.
        hidden_features (int | None): Width of the hidden layer; falls back
            to ``in_features`` when falsy.
        out_features (int | None): Size of the last output dimension; falls
            back to ``in_features`` when falsy.
        act_layer: Callable returning the activation module. Default: nn.GELU
        drop (float): Dropout probability used after both linear layers.
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        width_out = out_features or in_features
        width_hidden = hidden_features or in_features
        # Creation order is kept (fc1, act, fc2, drop) so parameter
        # initialisation and state-dict layout are unchanged.
        self.fc1 = nn.Linear(in_features, width_hidden)
        self.act = act_layer()
        self.fc2 = nn.Linear(width_hidden, width_out)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Apply fc1 -> activation -> dropout -> fc2 -> dropout."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))


def window_partition(x, window_size):
    """Split a feature volume into non-overlapping 3D windows.

    Args:
        x: Tensor of shape (B, D, H, W, C).
        window_size (tuple[int]): Window extent along (D, H, W).

    Returns:
        Tensor of shape (B * num_windows, Wd * Wh * Ww, C).
    """
    batch, depth, height, width, channels = x.shape
    grid = x.view(
        batch,
        depth // window_size[0], window_size[0],
        height // window_size[1], window_size[1],
        width // window_size[2], window_size[2],
        channels,
    )
    tokens_per_window = reduce(mul, window_size)
    # Move the three "window index" axes in front of the three
    # "position inside the window" axes before flattening.
    grouped = grid.permute(0, 1, 3, 5, 2, 4, 6, 7).contiguous()
    return grouped.view(-1, tokens_per_window, channels)


def window_reverse(windows, window_size, B, D, H, W):
    """Reassemble windows produced by ``window_partition`` into a volume.

    Args:
        windows: Tensor holding B * num_windows windows of
            Wd * Wh * Ww tokens each, channels last.
        window_size (tuple[int]): Window extent along (D, H, W).
        B (int): Batch size.
        D (int): Depth of the volume.
        H (int): Height of the volume.
        W (int): Width of the volume.

    Returns:
        Tensor of shape (B, D, H, W, C).
    """
    wd, wh, ww = window_size[0], window_size[1], window_size[2]
    grid = windows.view(B, D // wd, H // wh, W // ww, wd, wh, ww, -1)
    # Interleave each window-index axis with its in-window axis again.
    volume = grid.permute(0, 1, 4, 2, 5, 3, 6, 7).contiguous()
    return volume.view(B, D, H, W, -1)


def get_window_size(x_size, window_size, shift_size=None):
    """Clamp the window (and shift) to the actual input extent.

    For every axis whose input size does not exceed the requested window
    size, the window is shrunk to the input size and, if a shift was given,
    the shift on that axis is set to 0.

    Args:
        x_size (tuple[int]): Input extent per axis.
        window_size (tuple[int]): Requested window size per axis.
        shift_size (tuple[int] | None): Requested shift per axis.

    Returns:
        The clamped window size as a tuple when ``shift_size`` is None,
        otherwise the pair ``(window_size, shift_size)`` of tuples.
    """
    fitted_window = list(window_size)
    fitted_shift = None if shift_size is None else list(shift_size)

    for axis, extent in enumerate(x_size):
        if extent <= window_size[axis]:
            fitted_window[axis] = extent
            if fitted_shift is not None:
                fitted_shift[axis] = 0

    if fitted_shift is None:
        return tuple(fitted_window)
    return tuple(fitted_window), tuple(fitted_shift)
class SwinTransformerBlock3D(nn.Module):
    """ Swin Transformer Block.
    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (tuple[int]): Window size.
        shift_size (tuple[int]): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
        use_checkpoint (bool, optional): If True, run both halves of the block
            through ``torch.utils.checkpoint`` to trade compute for memory.
            Default: False
    """

    def __init__(self, dim, num_heads, window_size=(2, 7, 7), shift_size=(0, 0, 0),
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm, use_checkpoint=False):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        # Accepted as a keyword because the stage builder in this module
        # passes it; without the parameter construction raises TypeError.
        self.use_checkpoint = use_checkpoint

        assert 0 <= self.shift_size[0] < self.window_size[0], "shift_size must in 0-window_size"
        assert 0 <= self.shift_size[1] < self.window_size[1], "shift_size must in 0-window_size"
        assert 0 <= self.shift_size[2] < self.window_size[2], "shift_size must in 0-window_size"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention3D(
            dim, window_size=self.window_size, num_heads=num_heads,
            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)

        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

    def forward_part1(self, x, mask_matrix):
        """Attention half of the block: norm, (shifted) window attention.

        Args:
            x: Input feature, tensor size (B, D, H, W, C).
            mask_matrix: Attention mask, only used when a shift is active.

        Returns:
            Tensor of size (B, D, H, W, C); the residual is added by the caller.
        """
        B, D, H, W, C = x.shape
        # Shrink window/shift on axes where the input is not larger than the window.
        window_size, shift_size = get_window_size((D, H, W), self.window_size, self.shift_size)

        x = self.norm1(x)
        # pad feature maps to multiples of window size
        pad_l = pad_t = pad_d0 = 0
        pad_d1 = (window_size[0] - D % window_size[0]) % window_size[0]
        pad_b = (window_size[1] - H % window_size[1]) % window_size[1]
        pad_r = (window_size[2] - W % window_size[2]) % window_size[2]
        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b, pad_d0, pad_d1))
        _, Dp, Hp, Wp, _ = x.shape
        # cyclic shift
        if any(i > 0 for i in shift_size):
            shifted_x = torch.roll(x, shifts=(-shift_size[0], -shift_size[1], -shift_size[2]), dims=(1, 2, 3))
            attn_mask = mask_matrix
        else:
            shifted_x = x
            attn_mask = None
        # partition windows
        x_windows = window_partition(shifted_x, window_size)  # B*nW, Wd*Wh*Ww, C
        # W-MSA/SW-MSA
        attn_windows = self.attn(x_windows, mask=attn_mask)  # B*nW, Wd*Wh*Ww, C
        # merge windows
        attn_windows = attn_windows.view(-1, *(window_size + (C,)))
        shifted_x = window_reverse(attn_windows, window_size, B, Dp, Hp, Wp)  # B D' H' W' C
        # reverse cyclic shift
        if any(i > 0 for i in shift_size):
            x = torch.roll(shifted_x, shifts=(shift_size[0], shift_size[1], shift_size[2]), dims=(1, 2, 3))
        else:
            x = shifted_x

        # Drop the padding added above so the output matches the input size.
        if pad_d1 > 0 or pad_r > 0 or pad_b > 0:
            x = x[:, :D, :H, :W, :].contiguous()
        return x

    def forward_part2(self, x):
        """MLP half of the block: norm, MLP, stochastic depth."""
        return self.drop_path(self.mlp(self.norm2(x)))

    def forward(self, x, mask_matrix):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, D, H, W, C).
            mask_matrix: Attention mask for cyclic shift.
        Returns:
            Tensor of size (B, D, H, W, C).
        """
        shortcut = x
        if self.use_checkpoint:
            # Imported here because the module-level import is commented out.
            from torch.utils import checkpoint
            x = checkpoint.checkpoint(self.forward_part1, x, mask_matrix)
        else:
            x = self.forward_part1(x, mask_matrix)
        x = shortcut + self.drop_path(x)

        if self.use_checkpoint:
            x = x + checkpoint.checkpoint(self.forward_part2, x)
        else:
            x = x + self.forward_part2(x)

        return x


class PatchMerging(nn.Module):
    """ Patch Merging Layer
    Halves H and W by concatenating each 2x2 spatial neighbourhood along the
    channel axis (4*dim), normalising, and projecting to 2*dim channels.
    The depth axis D is left untouched.
    Args:
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    """

    def __init__(self, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def forward(self, x):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, D, H, W, C).
        Returns:
            Tensor of size (B, D, ceil(H/2), ceil(W/2), 2*C).
        """
        B, D, H, W, C = x.shape

        # padding: make H and W even so the stride-2 slices line up
        pad_input = (H % 2 == 1) or (W % 2 == 1)
        if pad_input:
            x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))

        x0 = x[:, :, 0::2, 0::2, :]  # B D H/2 W/2 C
        x1 = x[:, :, 1::2, 0::2, :]  # B D H/2 W/2 C
        x2 = x[:, :, 0::2, 1::2, :]  # B D H/2 W/2 C
        x3 = x[:, :, 1::2, 1::2, :]  # B D H/2 W/2 C
        x = torch.cat([x0, x1, x2, x3], -1)  # B D H/2 W/2 4*C

        x = self.norm(x)
        x = self.reduction(x)

        return x
class BasicLayer(nn.Module):
    """ A basic Swin Transformer layer for one stage.
    Args:
        dim (int): Number of feature channels
        depth (int): Depths of this stage.
        num_heads (int): Number of attention head.
        window_size (tuple[int]): Local window size. Default: (1,7,7).
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: False
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | list[float] | tuple[float], optional): Stochastic depth rate,
            either one value for all blocks or one value per block. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool, optional): Stored on the layer; it is not forwarded
            to the blocks. Default: False
    """

    def __init__(self,
                 dim,
                 depth,
                 num_heads,
                 window_size=(1, 7, 7),
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 norm_layer=nn.LayerNorm,
                 downsample=None,
                 use_checkpoint=False):
        super().__init__()
        self.window_size = window_size
        self.shift_size = tuple(i // 2 for i in window_size)
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # A list or tuple supplies one stochastic-depth rate per block; a
        # bare number is shared by every block.
        per_block_rates = isinstance(drop_path, (list, tuple))

        # build blocks
        # Even-indexed blocks use plain windows, odd-indexed blocks use
        # windows shifted by half the window size.
        # `use_checkpoint` is deliberately not passed on: forwarding it made
        # every construction fail with "unexpected keyword argument".
        self.blocks = nn.ModuleList([
            SwinTransformerBlock3D(
                dim=dim,
                num_heads=num_heads,
                window_size=window_size,
                shift_size=(0, 0, 0) if (i % 2 == 0) else self.shift_size,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop,
                attn_drop=attn_drop,
                drop_path=drop_path[i] if per_block_rates else drop_path,
                norm_layer=norm_layer,
            )
            for i in range(depth)])

        self.downsample = downsample
        if self.downsample is not None:
            self.downsample = downsample(dim=dim, norm_layer=norm_layer)

    def forward(self, x):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, C, D, H, W).
        Returns:
            Tensor in channels-first layout (B, C', D, H', W'); the spatial
            size and channel count only change when a downsample layer is set.
        """
        # calculate attention mask for SW-MSA
        B, C, D, H, W = x.shape
        window_size, shift_size = get_window_size((D, H, W), self.window_size, self.shift_size)
        x = rearrange(x, 'b c d h w -> b d h w c')
        # Padded extents: each axis rounded up to a multiple of its window size.
        Dp = int(np.ceil(D / window_size[0])) * window_size[0]
        Hp = int(np.ceil(H / window_size[1])) * window_size[1]
        Wp = int(np.ceil(W / window_size[2])) * window_size[2]
        attn_mask = compute_mask(Dp, Hp, Wp, window_size, shift_size, x.device)
        for blk in self.blocks:
            x = blk(x, attn_mask)
        x = x.view(B, D, H, W, -1)

        if self.downsample is not None:
            x = self.downsample(x)
        x = rearrange(x, 'b d h w c -> b c d h w')
        return x
+ embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, patch_size=(2, 4, 4), in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, D, H, W = x.size() + if W % self.patch_size[2] != 0: + x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2])) + if H % self.patch_size[1] != 0: + x = F.pad(x, (0, 0, 0, self.patch_size[1] - H % self.patch_size[1])) + if D % self.patch_size[0] != 0: + x = F.pad(x, (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0])) + + x = self.proj(x) # B C D Wh Ww + if self.norm is not None: + D, Wh, Ww = x.size(2), x.size(3), x.size(4) + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.transpose(1, 2).view(-1, self.embed_dim, D, Wh, Ww) + + return x + + +import torch +from torch import nn +from mmaction.models.heads.i3d_head import I3DHead + + +class SwinTransformer3D(nn.Module): + """ Swin Transformer backbone whose forward() also applies an I3DHead (num_classes=400). + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + Args: + patch_size (int | tuple(int)): Patch size. Default: (4,4,4). + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + num_heads (tuple[int]): Number of attention head of each stage. + window_size (tuple[int]): Window size. Default: (2, 7, 7). + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. 
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. Default: 0. + attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer: Normalization layer. Default: nn.LayerNorm. + patch_norm (bool): If True, add normalization after patch embedding. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + """ + + def __init__(self, + pretrained=None, + pretrained2d=True, + patch_size=(4, 4, 4), + in_chans=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=(2, 7, 7), + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + patch_norm=False, + frozen_stages=-1, + use_checkpoint=False): + super().__init__() + + self.pretrained = pretrained + self.pretrained2d = pretrained2d + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.patch_norm = patch_norm + self.frozen_stages = frozen_stages + self.window_size = window_size + self.patch_size = patch_size + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed3D( + patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2 ** i_layer), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + 
+ drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if i_layer < self.num_layers - 1 else None, + use_checkpoint=use_checkpoint) + self.layers.append(layer) + + self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) + + # add a norm layer for each output + self.norm = norm_layer(self.num_features) + self._freeze_stages() + self.I3HD = I3DHead(num_classes=400, in_channels=self.num_features) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + if self.frozen_stages >= 1: + self.pos_drop.eval() + for i in range(0, self.frozen_stages): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + # def inflate_weights(self, logger): + # """Inflate the swin2d parameters to swin3d. + # The differences between swin3d and swin2d mainly lie in an extra + # axis. To utilize the pretrained parameters in 2d model, + # the weight of swin2d models should be inflated to fit in the shapes of + # the 3d counterpart. + # Args: + # logger (logging.Logger): The logger used to print + # debugging information. 
+ # """ + # checkpoint = torch.load(self.pretrained, map_location='cpu') + # state_dict = checkpoint['model'] + # + # # delete relative_position_index since we always re-init it + # relative_position_index_keys = [k for k in state_dict.keys() if "relative_position_index" in k] + # for k in relative_position_index_keys: + # del state_dict[k] + # + # # delete attn_mask since we always re-init it + # attn_mask_keys = [k for k in state_dict.keys() if "attn_mask" in k] + # for k in attn_mask_keys: + # del state_dict[k] + # + # state_dict['patch_embed.proj.weight'] = state_dict['patch_embed.proj.weight'].unsqueeze(2).repeat(1,1,self.patch_size[0],1,1) / self.patch_size[0] + # + # # bicubic interpolate relative_position_bias_table if not match + # relative_position_bias_table_keys = [k for k in state_dict.keys() if "relative_position_bias_table" in k] + # for k in relative_position_bias_table_keys: + # relative_position_bias_table_pretrained = state_dict[k] + # relative_position_bias_table_current = self.state_dict()[k] + # L1, nH1 = relative_position_bias_table_pretrained.size() + # L2, nH2 = relative_position_bias_table_current.size() + # L2 = (2*self.window_size[1]-1) * (2*self.window_size[2]-1) + # wd = self.window_size[0] + # if nH1 != nH2: + # logger.warning(f"Error in loading {k}, passing") + # else: + # if L1 != L2: + # S1 = int(L1 ** 0.5) + # relative_position_bias_table_pretrained_resized = torch.nn.functional.interpolate( + # relative_position_bias_table_pretrained.permute(1, 0).view(1, nH1, S1, S1), size=(2*self.window_size[1]-1, 2*self.window_size[2]-1), + # mode='bicubic') + # relative_position_bias_table_pretrained = relative_position_bias_table_pretrained_resized.view(nH2, L2).permute(1, 0) + # state_dict[k] = relative_position_bias_table_pretrained.repeat(2*wd-1,1) + # + # msg = self.load_state_dict(state_dict, strict=False) + # logger.info(msg) + # logger.info(f"=> loaded successfully '{self.pretrained}'") + # del checkpoint + # torch.cuda.empty_cache() 
+ + # def init_weights(self, pretrained=None): + # """Initialize the weights in backbone. + # Args: + # pretrained (str, optional): Path to pre-trained weights. + # Defaults to None. + # """ + # def _init_weights(m): + # if isinstance(m, nn.Linear): + # trunc_normal_(m.weight, std=.02) + # if isinstance(m, nn.Linear) and m.bias is not None: + # nn.init.constant_(m.bias, 0) + # elif isinstance(m, nn.LayerNorm): + # nn.init.constant_(m.bias, 0) + # nn.init.constant_(m.weight, 1.0) + # + # if pretrained: + # self.pretrained = pretrained + # if isinstance(self.pretrained, str): + # self.apply(_init_weights) + # logger = get_root_logger() + # logger.info(f'load model from: {self.pretrained}') + # + # if self.pretrained2d: + # # Inflate 2D model into 3D model. + # self.inflate_weights(logger) + # else: + # # Directly load 3D model. + # load_checkpoint(self, self.pretrained, strict=False, logger=logger) + # elif self.pretrained is None: + # self.apply(_init_weights) + # else: + # raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Forward function: patch embedding, dropout, every stage, norm over the channel axis, then the I3D head.""" + x = self.patch_embed(x) + + x = self.pos_drop(x) + + for layer in self.layers: + x = layer(x.contiguous()) + + x = rearrange(x, 'n c d h w -> n d h w c') + x = self.norm(x) + x = rearrange(x, 'n d h w c -> n c d h w') + x = self.I3HD(x) + return x + + def train(self, mode=True): + """Convert the model into training mode while keeping the frozen stages frozen.""" + super(SwinTransformer3D, self).train(mode) + self._freeze_stages() diff --git a/setup.sh b/setup.sh new file mode 100644 index 0000000000..2a34e2cfcc --- /dev/null +++ b/setup.sh @@ -0,0 +1,7 @@ +#!/bin/bash +conda env create -f environment.yml +conda activate swint +pip install gdown +gdown https://drive.google.com/uc?id=1CC0DwkrJ3Lb-DhHXrmQ8g6mCr3I74umf +cp -r mmaction/* /srv/conda/envs/swint/lib/python3.7/site-packages/mmaction/ +https://drive.google.com/file/d/1z6Wqx2y0rUD_YyAWiwEAD8dgvHvAAgaD/view?usp=sharing \ No newline at end of file 
"""
Example script: build the Swin-T video model and load pretrained weights.

On the terminal run:
    mkdir weights
    cd weights
    gdown https://drive.google.com/uc?id=10_ArqSj837hBzoQTq3RPGBZgKbBvNfSe
to use the use_pretrained function with default parameters.
"""
import os

import cv2
import requests
import torch
from torchvision import transforms
from tqdm import tqdm

from mmaction.models.recognizers.swintransformer3d import SwinTransformer3D


def video2img(video_path: str, frame_interval: int = 20):
    """
    Samples frames from a video and packs them into one clip tensor.

    Every ``frame_interval``-th frame, starting with the first one, is cast
    to float, moved to channel-first layout, resized, center-cropped to 224
    and normalized per channel.
    Args:
        video_path: path to the video.
        frame_interval: keep one frame out of this many. Default: 20.
    Returns:
        torch tensor of (1, channels, frames, height, width); the leading
        axis is the batch axis the model expects.
    Raises:
        ValueError: if frame_interval is smaller than 1.
        RuntimeError: if no frame could be read from video_path.
    """
    if frame_interval < 1:
        raise ValueError(f"frame_interval must be >= 1, got {frame_interval}")

    # NOTE(review): cv2 decodes frames in BGR order, while these statistics
    # look like the usual RGB ImageNet mean/std -- confirm which channel order
    # the pretrained weights expect before relying on the predictions.
    transform = transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.Normalize(
            mean=[123.675, 116.28, 103.53],
            std=[58.395, 57.12, 57.375]
        ),
    ])

    frames = []
    vidcap = cv2.VideoCapture(video_path)
    try:
        count = 0
        success, image = vidcap.read()
        while success:
            if count % frame_interval == 0:
                # Channel-last frame -> float channel-first tensor, plus a
                # leading batch axis so stacking on dim=2 yields the frame axis.
                frame = torch.from_numpy(image).float().permute(2, 0, 1)
                frames.append(transform(frame).unsqueeze(dim=0))
            success, image = vidcap.read()
            count += 1
    finally:
        # Always hand the decoder/file handle back, even if a transform fails.
        vidcap.release()

    if not frames:
        # torch.stack on an empty list raises an opaque RuntimeError; keep the
        # exception type but say what actually went wrong.
        raise RuntimeError(f"Could not read any frame from {video_path!r}")
    return torch.stack(frames, dim=2)


def use_pretrained(model,
                   folder='weights/',
                   file_name="swint_victim_pretrained.pth",
                   download=False, url=None):
    """
    Loads pretrained weights into a model, optionally downloading them first.
    Args:
        model: model to load the weights to.
        folder: folder to load the weights from (and to download them into).
        file_name: name of the file to load the weights from.
        download: whether to download the weights from the url.
        url: url to download the weights from; required when download is True.
    Returns:
        model with loaded weights, or -1 when the number of downloaded bytes
        does not match the announced content length.
    Raises:
        ValueError: if download is True but url is None.
        requests.HTTPError: if the server answers with an error status.
    """
    weights_path = os.path.join(folder, file_name)

    if download:
        if url is None:
            raise ValueError("url must be provided when download=True")
        if folder:
            os.makedirs(folder, exist_ok=True)

        block_size = 1024 ** 2
        with requests.get(url, stream=True) as response:
            # Do not save an HTML error page as if it were a checkpoint.
            response.raise_for_status()
            total = int(response.headers.get('content-length', 0))
            with open(weights_path, 'wb') as file, \
                    tqdm(total=total, unit='iB', unit_scale=True) as progress_bar:
                for data in response.iter_content(block_size):
                    progress_bar.update(len(data))
                    file.write(data)
                downloaded = progress_bar.n

        if (total != 0) and (downloaded != total):
            print("ERROR downloading weights!")
            return -1
        print(f"Weights downloaded in {folder} directory!")

    # map_location='cpu' lets GPU-saved checkpoints load on CPU-only hosts;
    # load_state_dict then copies the values onto the model's own device.
    model.load_state_dict(torch.load(weights_path, map_location='cpu'))
    return model


model = SwinTransformer3D()
use_pretrained(model)
# The input must be of the form (batchSize, channels, frames, height, width)
loss_func = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.001,
    betas=(0.9, 0.999),
    weight_decay=0.02
)