From 5bb290002d9fdb5d9e9c779ffc27baef8bf2cd67 Mon Sep 17 00:00:00 2001
From: vidit <vidit.agarwal.eee20@itbhu.ac.in>
Date: Sun, 20 Mar 2022 05:57:00 +0530
Subject: [PATCH 1/2] Fixed mmaction folder to make Swin-T work and made the
 installation process much faster and easier

---
 docs/install.md                               |  37 +++--
 mmaction/apis/inference.py                    |  18 +--
 mmaction/apis/train.py                        |  62 ++------
 mmaction/core/__init__.py                     |   2 +-
 mmaction/core/bbox/transforms.py              |  32 ++---
 .../evaluation/ava_evaluation/np_box_list.py  |   7 +-
 .../object_detection_evaluation.py            |  92 +++++++-----
 .../ava_evaluation/per_image_evaluation.py    | 136 +++++++++++-------
 mmaction/core/evaluation/ava_utils.py         | 103 ++++++-------
 mmaction/core/evaluation/eval_hooks.py        |   5 +-
 mmaction/datasets/__init__.py                 |   4 +-
 mmaction/datasets/audio_visual_dataset.py     |   2 +-
 mmaction/datasets/base.py                     |   2 +-
 mmaction/datasets/blending_utils.py           |  28 ++--
 mmaction/datasets/pipelines/__init__.py       |   4 +-
 mmaction/datasets/pipelines/augmentations.py  |  62 ++------
 mmaction/datasets/pipelines/loading.py        |  76 ++++------
 mmaction/datasets/pipelines/pose_loading.py   |   8 +-
 mmaction/datasets/ssn_dataset.py              |   2 +-
 mmaction/models/__init__.py                   |   6 +-
 mmaction/models/backbones/__init__.py         |   4 +-
 mmaction/models/backbones/mobilenet_v2.py     |   8 +-
 mmaction/models/backbones/resnet3d.py         |  22 ++-
 mmaction/models/backbones/resnet3d_csn.py     |   2 +-
 .../models/backbones/resnet3d_slowfast.py     |   2 +-
 mmaction/models/backbones/resnet_audio.py     |   8 +-
 mmaction/models/backbones/tanet.py            |  36 ++---
 mmaction/models/common/lfb.py                 |   2 +-
 mmaction/models/heads/bbox_head.py            |  11 +-
 mmaction/models/heads/lfb_infer_head.py       |   2 +-
 mmaction/models/heads/misc_head.py            |   2 +-
 mmaction/models/recognizers/__init__.py       |   4 +-
 mmaction/models/recognizers/base.py           |  10 +-
 mmaction/models/recognizers/recognizer2d.py   |   4 +-
 mmaction/utils/__init__.py                    |   4 +-
 mmaction/utils/decorators.py                  |   1 +
 mmaction/utils/precise_bn.py                  |   5 +-
 37 files changed, 369 insertions(+), 446 deletions(-)

diff --git a/docs/install.md b/docs/install.md
index 9ba611d3bb..9d77303d2d 100644
--- a/docs/install.md
+++ b/docs/install.md
@@ -48,8 +48,25 @@ conda install -y jpeg libtiff
 If mmcv and mmcv-full are both installed, there will be `ModuleNotFoundError`.
 
 ## Prepare environment
-
-a. Create a conda virtual environment and activate it.
+- __*Method 1*__:
+  - Create the required conda environment from `environment.yml` and activate it:
+  ```shell
+   conda env create -f environment.yml
+   conda activate swint
+   ```
+  Next, find the path of the environment's Python interpreter with:
+  ```shell
+    which python
+  ```
+  For example, if this prints ```/home/vidit/miniconda3/bin/python```,
+  then run:
+  ```shell
+  cp -r mmaction/ /home/vidit/miniconda3/lib/python3.7/site-packages/mmaction/
+  ```
+  #### That is, take the part of the Python path before ```bin``` and append ```/lib/python3.7/site-packages/mmaction/``` to it.
+- __*Method 2*__:
+
+    a. Create a conda virtual environment and activate it.
 
 ```shell
 conda create -n open-mmlab python=3.7 -y
@@ -158,17 +175,17 @@ Note:
 1. The git commit id will be written to the version number with step b, e.g. 0.6.0+2e7045c. The version will also be saved in trained models.
    It is recommended that you run step b each time you pull some updates from github. If C++/CUDA codes are modified, then this step is compulsory.
 
-2. Following the above instructions, MMAction2 is installed on `dev` mode, any local modifications made to the code will take effect without the need to reinstall it (unless you submit some commits and want to update the version number).
+   1. Following the above instructions, MMAction2 is installed on `dev` mode, any local modifications made to the code will take effect without the need to reinstall it (unless you submit some commits and want to update the version number).
 
-3. If you would like to use `opencv-python-headless` instead of `opencv-python`,
-   you can install it before installing MMCV.
+   2. If you would like to use `opencv-python-headless` instead of `opencv-python`,
+      you can install it before installing MMCV.
 
-4. If you would like to use `PyAV`, you can install it with `conda install av -c conda-forge -y`.
+   3. If you would like to use `PyAV`, you can install it with `conda install av -c conda-forge -y`.
 
-5. Some dependencies are optional. Running `python setup.py develop` will only install the minimum runtime requirements.
-   To use optional dependencies like `decord`, either install them with `pip install -r requirements/optional.txt`
-   or specify desired extras when calling `pip` (e.g. `pip install -v -e .[optional]`,
-   valid keys for the `[optional]` field are `all`, `tests`, `build`, and `optional`) like `pip install -v -e .[tests,build]`.
+   4. Some dependencies are optional. Running `python setup.py develop` will only install the minimum runtime requirements.
+      To use optional dependencies like `decord`, either install them with `pip install -r requirements/optional.txt`
+      or specify desired extras when calling `pip` (e.g. `pip install -v -e .[optional]`,
+      valid keys for the `[optional]` field are `all`, `tests`, `build`, and `optional`) like `pip install -v -e .[tests,build]`.
 
 ## Install with CPU only
 
diff --git a/mmaction/apis/inference.py b/mmaction/apis/inference.py
index e31685b7f5..008e4e3d31 100644
--- a/mmaction/apis/inference.py
+++ b/mmaction/apis/inference.py
@@ -1,6 +1,5 @@
 import os
 import os.path as osp
-import re
 from operator import itemgetter
 
 import mmcv
@@ -107,23 +106,10 @@ def inference_recognizer(model,
         filename_tmpl = cfg.data.test.get('filename_tmpl', 'img_{:05}.jpg')
         modality = cfg.data.test.get('modality', 'RGB')
         start_index = cfg.data.test.get('start_index', 1)
-
-        # count the number of frames that match the format of `filename_tmpl`
-        # RGB pattern example: img_{:05}.jpg -> ^img_\d+.jpg$
-        # Flow patteren example: {}_{:05d}.jpg -> ^x_\d+.jpg$
-        pattern = f'^{filename_tmpl}$'
-        if modality == 'Flow':
-            pattern = pattern.replace('{}', 'x')
-        pattern = pattern.replace(
-            pattern[pattern.find('{'):pattern.find('}') + 1], '\\d+')
-        total_frames = len(
-            list(
-                filter(lambda x: re.match(pattern, x) is not None,
-                       os.listdir(video_path))))
-
         data = dict(
             frame_dir=video_path,
-            total_frames=total_frames,
+            total_frames=len(os.listdir(video_path)),
+            # assuming files in ``video_path`` are all named with ``filename_tmpl``  # noqa: E501
             label=-1,
             start_index=start_index,
             filename_tmpl=filename_tmpl,
diff --git a/mmaction/apis/train.py b/mmaction/apis/train.py
index 89e89ced98..ef3c1eec54 100644
--- a/mmaction/apis/train.py
+++ b/mmaction/apis/train.py
@@ -1,20 +1,15 @@
 import copy as cp
 import os.path as osp
-
 import torch
 from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
 from mmcv.runner import (DistSamplerSeedHook, EpochBasedRunner, OptimizerHook,
                          build_optimizer, get_dist_info)
 from mmcv.runner.hooks import Fp16OptimizerHook
-
 from ..core import (DistEvalHook, EvalHook, OmniSourceDistSamplerSeedHook,
                     OmniSourceRunner)
 from ..datasets import build_dataloader, build_dataset
 from ..utils import PreciseBNHook, get_root_logger
 from .test import multi_gpu_test
-from mmcv_custom.runner import EpochBasedRunnerAmp
-import apex
-import os.path as osp
 
 
 def train_model(model,
@@ -47,10 +42,8 @@ def train_model(model,
     # prepare data loaders
     dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
 
-    if 'optimizer_config' not in cfg:
-        cfg.optimizer_config={}
     dataloader_setting = dict(
-        videos_per_gpu=cfg.data.get('videos_per_gpu', 1) // cfg.optimizer_config.get('update_interval', 1),
+        videos_per_gpu=cfg.data.get('videos_per_gpu', 1),
         workers_per_gpu=cfg.data.get('workers_per_gpu', 1),
         num_gpus=len(cfg.gpu_ids),
         dist=distributed,
@@ -80,24 +73,6 @@ def train_model(model,
             build_dataloader(ds, **dataloader_setting) for ds in dataset
         ]
 
-    # build runner
-    optimizer = build_optimizer(model, cfg.optimizer)
-    # use apex fp16 optimizer
-    # Noticed that this is just a temporary patch. We shoud not encourage this kind of code style
-    use_amp = False
-    if (
-        cfg.optimizer_config.get("type", None)
-        and cfg.optimizer_config["type"] == "DistOptimizerHook"
-    ):
-        if cfg.optimizer_config.get("use_fp16", False):
-            model, optimizer = apex.amp.initialize(
-                model.cuda(), optimizer, opt_level="O1"
-            )
-            for m in model.modules():
-                if hasattr(m, "fp16_enabled"):
-                    m.fp16_enabled = True
-            use_amp = True
-
     # put model on gpus
     if distributed:
         find_unused_parameters = cfg.get('find_unused_parameters', False)
@@ -112,23 +87,16 @@ def train_model(model,
         model = MMDataParallel(
             model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
 
-    if use_amp:
-        Runner = EpochBasedRunnerAmp
-        runner = Runner(
-            model,
-            optimizer=optimizer,
-            work_dir=cfg.work_dir,
-            logger=logger,
-            meta=meta,
-            amp=use_amp)
-    else:
-        Runner = OmniSourceRunner if cfg.omnisource else EpochBasedRunner
-        runner = Runner(
-            model,
-            optimizer=optimizer,
-            work_dir=cfg.work_dir,
-            logger=logger,
-            meta=meta)
+    # build runner
+    optimizer = build_optimizer(model, cfg.optimizer)
+
+    Runner = OmniSourceRunner if cfg.omnisource else EpochBasedRunner
+    runner = Runner(
+        model,
+        optimizer=optimizer,
+        work_dir=cfg.work_dir,
+        logger=logger,
+        meta=meta)
     # an ugly workaround to make .log and .log.json filenames the same
     runner.timestamp = timestamp
 
@@ -184,9 +152,7 @@ def train_model(model,
         runner.register_hook(eval_hook(val_dataloader, **eval_cfg))
 
     if cfg.resume_from:
-        runner.resume(cfg.resume_from, resume_amp=use_amp)
-    elif cfg.get("auto_resume", False) and osp.exists(osp.join(runner.work_dir, 'latest.pth')):
-        runner.auto_resume()
+        runner.resume(cfg.resume_from)
     elif cfg.load_from:
         runner.load_checkpoint(cfg.load_from)
     runner_kwargs = dict()
@@ -257,5 +223,5 @@ def train_model(model,
 
                 eval_res = test_dataset.evaluate(outputs, **eval_cfg)
                 runner.logger.info(f'Testing results of the {name} checkpoint')
-                for metric_name, val in eval_res.items():
-                    runner.logger.info(f'{metric_name}: {val:.04f}')
+                for name, val in eval_res.items():
+                    runner.logger.info(f'{name}: {val:.04f}')
diff --git a/mmaction/core/__init__.py b/mmaction/core/__init__.py
index f5f617cdf2..6842299583 100644
--- a/mmaction/core/__init__.py
+++ b/mmaction/core/__init__.py
@@ -1,6 +1,6 @@
 from .bbox import *  # noqa: F401, F403
 from .evaluation import *  # noqa: F401, F403
 from .hooks import *  # noqa: F401, F403
+from .lr import *  # noqa: F401, F403
 from .optimizer import *  # noqa: F401, F403
 from .runner import *  # noqa: F401, F403
-from .scheduler import *  # noqa: F401, F403
diff --git a/mmaction/core/bbox/transforms.py b/mmaction/core/bbox/transforms.py
index b051e2275e..d61e8116d7 100644
--- a/mmaction/core/bbox/transforms.py
+++ b/mmaction/core/bbox/transforms.py
@@ -15,22 +15,22 @@ def bbox2result(bboxes, labels, num_classes, thr=0.01):
     """
     if bboxes.shape[0] == 0:
         return list(np.zeros((num_classes - 1, 0, 5), dtype=np.float32))
+    else:
+        bboxes = bboxes.cpu().numpy()
+        labels = labels.cpu().numpy()
 
-    bboxes = bboxes.cpu().numpy()
-    labels = labels.cpu().numpy()
+        # We only handle multilabel now
+        assert labels.shape[-1] > 1
 
-    # We only handle multilabel now
-    assert labels.shape[-1] > 1
+        scores = labels  # rename for clarification
+        thr = (thr, ) * num_classes if isinstance(thr, float) else thr
+        assert scores.shape[1] == num_classes
+        assert len(thr) == num_classes
 
-    scores = labels  # rename for clarification
-    thr = (thr, ) * num_classes if isinstance(thr, float) else thr
-    assert scores.shape[1] == num_classes
-    assert len(thr) == num_classes
-
-    result = []
-    for i in range(num_classes - 1):
-        where = scores[:, i + 1] > thr[i + 1]
-        result.append(
-            np.concatenate((bboxes[where, :4], scores[where, i + 1:i + 2]),
-                           axis=1))
-    return result
+        result = []
+        for i in range(num_classes - 1):
+            where = scores[:, i + 1] > thr[i + 1]
+            result.append(
+                np.concatenate((bboxes[where, :4], scores[where, i + 1:i + 2]),
+                               axis=1))
+        return result
diff --git a/mmaction/core/evaluation/ava_evaluation/np_box_list.py b/mmaction/core/evaluation/ava_evaluation/np_box_list.py
index ddfdd5184d..f9b101e6f5 100644
--- a/mmaction/core/evaluation/ava_evaluation/np_box_list.py
+++ b/mmaction/core/evaluation/ava_evaluation/np_box_list.py
@@ -120,9 +120,8 @@ def get_coordinates(self):
         x_max = box_coordinates[:, 3]
         return [y_min, x_min, y_max, x_max]
 
-    @staticmethod
-    def _is_valid_boxes(data):
-        """Check whether data fulfills the format of N*[ymin, xmin, ymax,
+    def _is_valid_boxes(self, data):
+        """Check whether data fulfills the format of N*[ymin, xmin, ymax,
         xmin].
 
         Args:
@@ -132,7 +131,7 @@ def _is_valid_boxes(data):
             a boolean indicating whether all ymax of boxes are equal or greater
             than ymin, and all xmax of boxes are equal or greater than xmin.
         """
-        if len(data) != 0:
+        if len(data):
             for v in data:
                 if v[0] > v[2] or v[1] > v[3]:
                     return False
diff --git a/mmaction/core/evaluation/ava_evaluation/object_detection_evaluation.py b/mmaction/core/evaluation/ava_evaluation/object_detection_evaluation.py
index 508a076def..95f0cc501c 100644
--- a/mmaction/core/evaluation/ava_evaluation/object_detection_evaluation.py
+++ b/mmaction/core/evaluation/ava_evaluation/object_detection_evaluation.py
@@ -29,7 +29,6 @@
 
 import collections
 import logging
-import warnings
 from abc import ABCMeta, abstractmethod
 from collections import defaultdict
 
@@ -102,13 +101,15 @@ def clear(self):
 class ObjectDetectionEvaluator(DetectionEvaluator):
     """A class to evaluate detections."""
 
-    def __init__(self,
-                 categories,
-                 matching_iou_threshold=0.5,
-                 evaluate_corlocs=False,
-                 metric_prefix=None,
-                 use_weighted_mean_ap=False,
-                 evaluate_masks=False):
+    def __init__(
+        self,
+        categories,
+        matching_iou_threshold=0.5,
+        evaluate_corlocs=False,
+        metric_prefix=None,
+        use_weighted_mean_ap=False,
+        evaluate_masks=False,
+    ):
         """Constructor.
 
         Args:
@@ -243,8 +244,7 @@ def add_single_detected_image_info(self, image_id, detections_dict):
             detected_masks=detection_masks,
         )
 
-    @staticmethod
-    def create_category_index(categories):
+    def create_category_index(self, categories):
         """Creates dictionary of COCO compatible categories keyed by category
         id.
 
@@ -277,8 +277,14 @@ def evaluate(self):
             2. per_category_ap: category specific results with keys of the form
                'PerformanceByCategory/mAP@<matching_iou_threshold>IOU/category'
         """
-        (per_class_ap, mean_ap, _, _, per_class_corloc,
-         mean_corloc) = self._evaluation.evaluate()
+        (
+            per_class_ap,
+            mean_ap,
+            _,
+            _,
+            per_class_corloc,
+            mean_corloc,
+        ) = self._evaluation.evaluate()
 
         metric = f'mAP@{self._matching_iou_threshold}IOU'
         pascal_metrics = {self._metric_prefix + metric: mean_ap}
@@ -349,13 +355,15 @@ def __init__(self, categories, matching_iou_threshold=0.5):
 class ObjectDetectionEvaluation:
     """Internal implementation of Pascal object detection metrics."""
 
-    def __init__(self,
-                 num_groundtruth_classes,
-                 matching_iou_threshold=0.5,
-                 nms_iou_threshold=1.0,
-                 nms_max_output_boxes=10000,
-                 use_weighted_mean_ap=False,
-                 label_id_offset=0):
+    def __init__(
+        self,
+        num_groundtruth_classes,
+        matching_iou_threshold=0.5,
+        nms_iou_threshold=1.0,
+        nms_max_output_boxes=10000,
+        use_weighted_mean_ap=False,
+        label_id_offset=0,
+    ):
         if num_groundtruth_classes < 1:
             raise ValueError(
                 'Need at least 1 groundtruth class for evaluation.')
@@ -391,11 +399,13 @@ def _initialize_detections(self):
     def clear_detections(self):
         self._initialize_detections()
 
-    def add_single_ground_truth_image_info(self,
-                                           image_key,
-                                           groundtruth_boxes,
-                                           groundtruth_class_labels,
-                                           groundtruth_masks=None):
+    def add_single_ground_truth_image_info(
+        self,
+        image_key,
+        groundtruth_boxes,
+        groundtruth_class_labels,
+        groundtruth_masks=None,
+    ):
         """Adds groundtruth for a single image to be used for evaluation.
 
         Args:
@@ -410,8 +420,8 @@ def add_single_ground_truth_image_info(self,
                 masks. The mask values range from 0 to 1.
         """
         if image_key in self.groundtruth_boxes:
-            warnings.warn(('image %s has already been added to the ground '
-                           'truth database.'), image_key)
+            logging.warn(('image %s has already been added to the ground '
+                          'truth database.'), image_key)
             return
 
         self.groundtruth_boxes[image_key] = groundtruth_boxes
@@ -420,12 +430,14 @@ def add_single_ground_truth_image_info(self,
 
         self._update_ground_truth_statistics(groundtruth_class_labels)
 
-    def add_single_detected_image_info(self,
-                                       image_key,
-                                       detected_boxes,
-                                       detected_scores,
-                                       detected_class_labels,
-                                       detected_masks=None):
+    def add_single_detected_image_info(
+        self,
+        image_key,
+        detected_boxes,
+        detected_scores,
+        detected_class_labels,
+        detected_masks=None,
+    ):
         """Adds detections for a single image to be used for evaluation.
 
         Args:
@@ -456,8 +468,8 @@ def add_single_detected_image_info(self,
             )
 
         if image_key in self.detection_keys:
-            warnings.warn(('image %s has already been added to the ground '
-                           'truth database.'), image_key)
+            logging.warn(('image %s has already been added to the ground '
+                          'truth database.'), image_key)
             return
 
         self.detection_keys.add(image_key)
@@ -524,7 +536,8 @@ def evaluate(self):
             logging.info(
                 'The following classes have no ground truth examples: %s',
                 np.squeeze(np.argwhere(self.num_gt_instances_per_class == 0)) +
-                self.label_id_offset)
+                self.label_id_offset,
+            )
 
         if self.use_weighted_mean_ap:
             all_scores = np.array([], dtype=float)
@@ -544,8 +557,10 @@ def evaluate(self):
                 all_scores = np.append(all_scores, scores)
                 all_tp_fp_labels = np.append(all_tp_fp_labels, tp_fp_labels)
             precision, recall = metrics.compute_precision_recall(
-                scores, tp_fp_labels,
-                self.num_gt_instances_per_class[class_index])
+                scores,
+                tp_fp_labels,
+                self.num_gt_instances_per_class[class_index],
+            )
             self.precisions_per_class.append(precision)
             self.recalls_per_class.append(recall)
             average_precision = metrics.compute_average_precision(
@@ -554,7 +569,8 @@ def evaluate(self):
 
         self.corloc_per_class = metrics.compute_cor_loc(
             self.num_gt_imgs_per_class,
-            self.num_images_correctly_detected_per_class)
+            self.num_images_correctly_detected_per_class,
+        )
 
         if self.use_weighted_mean_ap:
             num_gt_instances = np.sum(self.num_gt_instances_per_class)
diff --git a/mmaction/core/evaluation/ava_evaluation/per_image_evaluation.py b/mmaction/core/evaluation/ava_evaluation/per_image_evaluation.py
index 2d06672d89..6265c17d7a 100644
--- a/mmaction/core/evaluation/ava_evaluation/per_image_evaluation.py
+++ b/mmaction/core/evaluation/ava_evaluation/per_image_evaluation.py
@@ -40,14 +40,16 @@ def __init__(self, num_groundtruth_classes, matching_iou_threshold=0.5):
         self.matching_iou_threshold = matching_iou_threshold
         self.num_groundtruth_classes = num_groundtruth_classes
 
-    def compute_object_detection_metrics(self,
-                                         detected_boxes,
-                                         detected_scores,
-                                         detected_class_labels,
-                                         groundtruth_boxes,
-                                         groundtruth_class_labels,
-                                         detected_masks=None,
-                                         groundtruth_masks=None):
+    def compute_object_detection_metrics(
+        self,
+        detected_boxes,
+        detected_scores,
+        detected_class_labels,
+        groundtruth_boxes,
+        groundtruth_class_labels,
+        detected_masks=None,
+        groundtruth_masks=None,
+    ):
         """Evaluates detections as being tp, fp or ignored from a single image.
 
         The evaluation is done in two stages:
@@ -103,14 +105,16 @@ def compute_object_detection_metrics(self,
 
         return scores, tp_fp_labels
 
-    def _compute_tp_fp(self,
-                       detected_boxes,
-                       detected_scores,
-                       detected_class_labels,
-                       groundtruth_boxes,
-                       groundtruth_class_labels,
-                       detected_masks=None,
-                       groundtruth_masks=None):
+    def _compute_tp_fp(
+        self,
+        detected_boxes,
+        detected_scores,
+        detected_class_labels,
+        groundtruth_boxes,
+        groundtruth_class_labels,
+        detected_masks=None,
+        groundtruth_masks=None,
+    ):
         """Labels true/false positives of detections of an image across all
         classes.
 
@@ -155,12 +159,18 @@ def _compute_tp_fp(self,
         result_scores = []
         result_tp_fp_labels = []
         for i in range(self.num_groundtruth_classes):
-            (gt_boxes_at_ith_class, gt_masks_at_ith_class,
-             detected_boxes_at_ith_class, detected_scores_at_ith_class,
-             detected_masks_at_ith_class) = self._get_ith_class_arrays(
-                 detected_boxes, detected_scores, detected_masks,
-                 detected_class_labels, groundtruth_boxes, groundtruth_masks,
-                 groundtruth_class_labels, i)
+            (
+                gt_boxes_at_ith_class,
+                gt_masks_at_ith_class,
+                detected_boxes_at_ith_class,
+                detected_scores_at_ith_class,
+                detected_masks_at_ith_class,
+            ) = self._get_ith_class_arrays(detected_boxes, detected_scores,
+                                           detected_masks,
+                                           detected_class_labels,
+                                           groundtruth_boxes,
+                                           groundtruth_masks,
+                                           groundtruth_class_labels, i)
             scores, tp_fp_labels = self._compute_tp_fp_for_single_class(
                 detected_boxes=detected_boxes_at_ith_class,
                 detected_scores=detected_scores_at_ith_class,
@@ -172,9 +182,8 @@ def _compute_tp_fp(self,
             result_tp_fp_labels.append(tp_fp_labels)
         return result_scores, result_tp_fp_labels
 
-    @staticmethod
-    def _get_overlaps_and_scores_box_mode(detected_boxes, detected_scores,
-                                          groundtruth_boxes):
+    def _get_overlaps_and_scores_box_mode(self, detected_boxes,
+                                          detected_scores, groundtruth_boxes):
         """Computes overlaps and scores between detected and groudntruth boxes.
 
         Args:
@@ -205,12 +214,14 @@ def _get_overlaps_and_scores_box_mode(detected_boxes, detected_scores,
         num_boxes = detected_boxlist.num_boxes()
         return iou, None, scores, num_boxes
 
-    def _compute_tp_fp_for_single_class(self,
-                                        detected_boxes,
-                                        detected_scores,
-                                        groundtruth_boxes,
-                                        detected_masks=None,
-                                        groundtruth_masks=None):
+    def _compute_tp_fp_for_single_class(
+        self,
+        detected_boxes,
+        detected_scores,
+        groundtruth_boxes,
+        detected_masks=None,
+        groundtruth_masks=None,
+    ):
         """Labels boxes detected with the same class from the same image as
         tp/fp.
 
@@ -238,11 +249,15 @@ def _compute_tp_fp_for_single_class(self,
         if detected_boxes.size == 0:
             return np.array([], dtype=float), np.array([], dtype=bool)
 
-        (iou, _, scores,
-         num_detected_boxes) = self._get_overlaps_and_scores_box_mode(
-             detected_boxes=detected_boxes,
-             detected_scores=detected_scores,
-             groundtruth_boxes=groundtruth_boxes)
+        (
+            iou,
+            _,
+            scores,
+            num_detected_boxes,
+        ) = self._get_overlaps_and_scores_box_mode(
+            detected_boxes=detected_boxes,
+            detected_scores=detected_scores,
+            groundtruth_boxes=groundtruth_boxes)
 
         if groundtruth_boxes.size == 0:
             return scores, np.zeros(num_detected_boxes, dtype=bool)
@@ -267,11 +282,17 @@ def _compute_tp_fp_for_single_class(self,
 
         return scores, tp_fp_labels
 
-    @staticmethod
-    def _get_ith_class_arrays(detected_boxes, detected_scores, detected_masks,
-                              detected_class_labels, groundtruth_boxes,
-                              groundtruth_masks, groundtruth_class_labels,
-                              class_index):
+    def _get_ith_class_arrays(
+        self,
+        detected_boxes,
+        detected_scores,
+        detected_masks,
+        detected_class_labels,
+        groundtruth_boxes,
+        groundtruth_masks,
+        groundtruth_class_labels,
+        class_index,
+    ):
         """Returns numpy arrays belonging to class with index `class_index`.
 
         Args:
@@ -311,15 +332,21 @@ class labels.
             detected_masks_at_ith_class = detected_masks[selected_detections]
         else:
             detected_masks_at_ith_class = None
-        return (gt_boxes_at_ith_class, gt_masks_at_ith_class,
-                detected_boxes_at_ith_class, detected_scores_at_ith_class,
-                detected_masks_at_ith_class)
-
-    @staticmethod
-    def _remove_invalid_boxes(detected_boxes,
-                              detected_scores,
-                              detected_class_labels,
-                              detected_masks=None):
+        return (
+            gt_boxes_at_ith_class,
+            gt_masks_at_ith_class,
+            detected_boxes_at_ith_class,
+            detected_scores_at_ith_class,
+            detected_masks_at_ith_class,
+        )
+
+    def _remove_invalid_boxes(
+        self,
+        detected_boxes,
+        detected_scores,
+        detected_class_labels,
+        detected_masks=None,
+    ):
         """Removes entries with invalid boxes.
 
         A box is invalid if either its xmax is smaller than its xmin, or its
@@ -346,13 +373,16 @@ def _remove_invalid_boxes(detected_boxes,
         """
         valid_indices = np.logical_and(
             detected_boxes[:, 0] < detected_boxes[:, 2],
-            detected_boxes[:, 1] < detected_boxes[:, 3])
+            detected_boxes[:, 1] < detected_boxes[:, 3],
+        )
         detected_boxes = detected_boxes[valid_indices]
         detected_scores = detected_scores[valid_indices]
         detected_class_labels = detected_class_labels[valid_indices]
         if detected_masks is not None:
             detected_masks = detected_masks[valid_indices]
         return [
-            detected_boxes, detected_scores, detected_class_labels,
-            detected_masks
+            detected_boxes,
+            detected_scores,
+            detected_class_labels,
+            detected_masks,
         ]
diff --git a/mmaction/core/evaluation/ava_utils.py b/mmaction/core/evaluation/ava_utils.py
index 159297fb7d..01036b85f9 100644
--- a/mmaction/core/evaluation/ava_utils.py
+++ b/mmaction/core/evaluation/ava_utils.py
@@ -35,14 +35,14 @@ def results2csv(dataset, results, out_file, custom_classes=None):
         csv_results = det2csv(dataset, results, custom_classes)
 
     # save space for float
-    def to_str(item):
+    def tostr(item):
         if isinstance(item, float):
             return f'{item:.3f}'
         return str(item)
 
     with open(out_file, 'w') as f:
         for csv_result in csv_results:
-            f.write(','.join(map(to_str, csv_result)))
+            f.write(','.join(map(lambda x: tostr(x), csv_result)))
             f.write('\n')
 
 
@@ -157,6 +157,7 @@ def ava_eval(result_file,
              label_file,
              ann_file,
              exclude_file,
+             max_dets=(100, ),
              verbose=True,
              custom_classes=None):
 
@@ -185,52 +186,52 @@ def ava_eval(result_file,
     if verbose:
         print_time('Reading detection results', start)
 
-    # Evaluation for mAP
-    pascal_evaluator = det_eval.PascalDetectionEvaluator(categories)
-
-    start = time.time()
-    for image_key in gt_boxes:
-        if verbose and image_key in excluded_keys:
-            logging.info(
-                'Found excluded timestamp in detections: %s.'
-                'It will be ignored.', image_key)
-            continue
-        pascal_evaluator.add_single_ground_truth_image_info(
-            image_key, {
-                standard_fields.InputDataFields.groundtruth_boxes:
-                np.array(gt_boxes[image_key], dtype=float),
-                standard_fields.InputDataFields.groundtruth_classes:
-                np.array(gt_labels[image_key], dtype=int)
-            })
-    if verbose:
-        print_time('Convert groundtruth', start)
-
-    start = time.time()
-    for image_key in boxes:
-        if verbose and image_key in excluded_keys:
-            logging.info(
-                'Found excluded timestamp in detections: %s.'
-                'It will be ignored.', image_key)
-            continue
-        pascal_evaluator.add_single_detected_image_info(
-            image_key, {
-                standard_fields.DetectionResultFields.detection_boxes:
-                np.array(boxes[image_key], dtype=float),
-                standard_fields.DetectionResultFields.detection_classes:
-                np.array(labels[image_key], dtype=int),
-                standard_fields.DetectionResultFields.detection_scores:
-                np.array(scores[image_key], dtype=float)
-            })
-    if verbose:
-        print_time('convert detections', start)
-
-    start = time.time()
-    metrics = pascal_evaluator.evaluate()
-    if verbose:
-        print_time('run_evaluator', start)
-    for display_name in metrics:
-        print(f'{display_name}=\t{metrics[display_name]}')
-    return {
-        display_name: metrics[display_name]
-        for display_name in metrics if 'ByCategory' not in display_name
-    }
+    if result_type == 'mAP':
+        pascal_evaluator = det_eval.PascalDetectionEvaluator(categories)
+
+        start = time.time()
+        for image_key in gt_boxes:
+            if verbose and image_key in excluded_keys:
+                logging.info(
+                    'Found excluded timestamp in detections: %s.'
+                    'It will be ignored.', image_key)
+                continue
+            pascal_evaluator.add_single_ground_truth_image_info(
+                image_key, {
+                    standard_fields.InputDataFields.groundtruth_boxes:
+                    np.array(gt_boxes[image_key], dtype=float),
+                    standard_fields.InputDataFields.groundtruth_classes:
+                    np.array(gt_labels[image_key], dtype=int)
+                })
+        if verbose:
+            print_time('Convert groundtruth', start)
+
+        start = time.time()
+        for image_key in boxes:
+            if verbose and image_key in excluded_keys:
+                logging.info(
+                    'Found excluded timestamp in detections: %s.'
+                    'It will be ignored.', image_key)
+                continue
+            pascal_evaluator.add_single_detected_image_info(
+                image_key, {
+                    standard_fields.DetectionResultFields.detection_boxes:
+                    np.array(boxes[image_key], dtype=float),
+                    standard_fields.DetectionResultFields.detection_classes:
+                    np.array(labels[image_key], dtype=int),
+                    standard_fields.DetectionResultFields.detection_scores:
+                    np.array(scores[image_key], dtype=float)
+                })
+        if verbose:
+            print_time('convert detections', start)
+
+        start = time.time()
+        metrics = pascal_evaluator.evaluate()
+        if verbose:
+            print_time('run_evaluator', start)
+        for display_name in metrics:
+            print(f'{display_name}=\t{metrics[display_name]}')
+        return {
+            display_name: metrics[display_name]
+            for display_name in metrics if 'ByCategory' not in display_name
+        }
diff --git a/mmaction/core/evaluation/eval_hooks.py b/mmaction/core/evaluation/eval_hooks.py
index 9ef5a8ad34..d96ad87a6b 100644
--- a/mmaction/core/evaluation/eval_hooks.py
+++ b/mmaction/core/evaluation/eval_hooks.py
@@ -4,6 +4,7 @@
 from math import inf
 
 import torch.distributed as dist
+from mmcv.runner import Hook
 from torch.nn.modules.batchnorm import _BatchNorm
 from torch.utils.data import DataLoader
 
@@ -39,8 +40,6 @@ def __init__(self, *args, save_best='auto', **kwargs):
 
 if not from_mmcv:
 
-    from mmcv.runner import Hook
-
     class EvalHook(Hook):  # noqa: F811
         """Non-Distributed evaluation hook.
 
@@ -363,7 +362,7 @@ def _do_evaluate(self, runner):
             # of rank 0 to other ranks to avoid this.
             if self.broadcast_bn_buffer:
                 model = runner.model
-                for _, module in model.named_modules():
+                for module in model.modules():
                     if isinstance(module,
                                   _BatchNorm) and module.track_running_stats:
                         dist.broadcast(module.running_var, 0)
diff --git a/mmaction/datasets/__init__.py b/mmaction/datasets/__init__.py
index c7e23fe252..c4b10e53e5 100644
--- a/mmaction/datasets/__init__.py
+++ b/mmaction/datasets/__init__.py
@@ -5,7 +5,7 @@
 from .ava_dataset import AVADataset
 from .base import BaseDataset
 from .blending_utils import (BaseMiniBatchBlending, CutmixBlending,
-                             MixupBlending, LabelSmoothing)
+                             MixupBlending)
 from .builder import (BLENDINGS, DATASETS, PIPELINES, build_dataloader,
                       build_dataset)
 from .dataset_wrappers import RepeatDataset
@@ -22,6 +22,6 @@
     'RawframeDataset', 'BaseDataset', 'ActivityNetDataset', 'SSNDataset',
     'HVUDataset', 'AudioDataset', 'AudioFeatureDataset', 'ImageDataset',
     'RawVideoDataset', 'AVADataset', 'AudioVisualDataset',
-    'BaseMiniBatchBlending', 'CutmixBlending', 'MixupBlending', 'LabelSmoothing', 'DATASETS',
+    'BaseMiniBatchBlending', 'CutmixBlending', 'MixupBlending', 'DATASETS',
     'PIPELINES', 'BLENDINGS', 'PoseDataset'
 ]
diff --git a/mmaction/datasets/audio_visual_dataset.py b/mmaction/datasets/audio_visual_dataset.py
index e3d5fabfbf..6e10b4b040 100644
--- a/mmaction/datasets/audio_visual_dataset.py
+++ b/mmaction/datasets/audio_visual_dataset.py
@@ -65,7 +65,7 @@ def load_annotations(self):
                     idx += 1
                 # idx for label[s]
                 label = [int(x) for x in line_split[idx:]]
-                assert len(label) != 0, f'missing label in line: {line}'
+                assert len(label), f'missing label in line: {line}'
                 if self.multi_class:
                     assert self.num_classes is not None
                     video_info['label'] = label
diff --git a/mmaction/datasets/base.py b/mmaction/datasets/base.py
index e4f753388c..62fe34f214 100644
--- a/mmaction/datasets/base.py
+++ b/mmaction/datasets/base.py
@@ -90,7 +90,7 @@ def __init__(self,
             self.video_infos_by_class = self.parse_by_class()
 
             class_prob = []
-            for _, samples in self.video_infos_by_class.items():
+            for samples in self.video_infos_by_class.values():
                 class_prob.append(len(samples) / len(self.video_infos))
             class_prob = [x**self.power for x in class_prob]
 
diff --git a/mmaction/datasets/blending_utils.py b/mmaction/datasets/blending_utils.py
index 64fdcf7eec..8ef35b0e73 100644
--- a/mmaction/datasets/blending_utils.py
+++ b/mmaction/datasets/blending_utils.py
@@ -6,20 +6,14 @@
 
 from .builder import BLENDINGS
 
-__all__ = ['BaseMiniBatchBlending', 'MixupBlending', 'CutmixBlending', 'LabelSmoothing']
-
-def one_hot(x, num_classes, on_value=1., off_value=0., device='cuda'):
-    x = x.long().view(-1, 1)
-    return torch.full((x.size()[0], num_classes), off_value, device=device).scatter_(1, x, on_value)
+__all__ = ['BaseMiniBatchBlending', 'MixupBlending', 'CutmixBlending']
 
 
 class BaseMiniBatchBlending(metaclass=ABCMeta):
     """Base class for Image Aliasing."""
 
-    def __init__(self, num_classes, smoothing=0.):
+    def __init__(self, num_classes):
         self.num_classes = num_classes
-        self.off_value = smoothing / self.num_classes
-        self.on_value = 1. - smoothing + self.off_value
 
     @abstractmethod
     def do_blending(self, imgs, label, **kwargs):
@@ -53,7 +47,7 @@ def __call__(self, imgs, label, **kwargs):
                 the shape of (B, 1, num_classes) and all elements are in range
                 [0, 1].
         """
-        one_hot_label = one_hot(label, num_classes=self.num_classes, on_value=self.on_value, off_value=self.off_value, device=label.device)
+        one_hot_label = F.one_hot(label, num_classes=self.num_classes)
 
         mixed_imgs, mixed_label = self.do_blending(imgs, one_hot_label,
                                                    **kwargs)
@@ -74,8 +68,8 @@ class MixupBlending(BaseMiniBatchBlending):
         alpha (float): Parameters for Beta distribution.
     """
 
-    def __init__(self, num_classes, alpha=.2, smoothing=0.):
-        super().__init__(num_classes=num_classes, smoothing=smoothing)
+    def __init__(self, num_classes, alpha=.2):
+        super().__init__(num_classes=num_classes)
         self.beta = Beta(alpha, alpha)
 
     def do_blending(self, imgs, label, **kwargs):
@@ -95,16 +89,18 @@ def do_blending(self, imgs, label, **kwargs):
 @BLENDINGS.register_module()
 class CutmixBlending(BaseMiniBatchBlending):
     """Implementing Cutmix in a mini-batch.
+
     This module is proposed in `CutMix: Regularization Strategy to Train Strong
     Classifiers with Localizable Features <https://arxiv.org/abs/1905.04899>`_.
     Code Reference https://github.com/clovaai/CutMix-PyTorch
+
     Args:
         num_classes (int): The number of classes.
         alpha (float): Parameters for Beta distribution.
     """
 
-    def __init__(self, num_classes, alpha=.2, smoothing=0.):
-        super().__init__(num_classes=num_classes, smoothing=smoothing)
+    def __init__(self, num_classes, alpha=.2):
+        super().__init__(num_classes=num_classes)
         self.beta = Beta(alpha, alpha)
 
     @staticmethod
@@ -144,9 +140,3 @@ def do_blending(self, imgs, label, **kwargs):
         label = lam * label + (1 - lam) * label[rand_index, :]
 
         return imgs, label
-
-
-@BLENDINGS.register_module()
-class LabelSmoothing(BaseMiniBatchBlending):
-    def do_blending(self, imgs, label, **kwargs):
-        return imgs, label
diff --git a/mmaction/datasets/pipelines/__init__.py b/mmaction/datasets/pipelines/__init__.py
index ee71544de3..8a15583ce9 100644
--- a/mmaction/datasets/pipelines/__init__.py
+++ b/mmaction/datasets/pipelines/__init__.py
@@ -1,7 +1,7 @@
 from .augmentations import (AudioAmplify, CenterCrop, ColorJitter,
                             EntityBoxCrop, EntityBoxFlip, EntityBoxRescale,
                             Flip, Fuse, Imgaug, MelSpectrogram, MultiGroupCrop,
-                            MultiScaleCrop, Normalize, RandomCrop, RandomErasing,
+                            MultiScaleCrop, Normalize, RandomCrop,
                             RandomRescale, RandomResizedCrop, RandomScale,
                             Resize, TenCrop, ThreeCrop)
 from .compose import Compose
@@ -21,7 +21,7 @@
 
 __all__ = [
     'SampleFrames', 'PyAVDecode', 'DecordDecode', 'DenseSampleFrames',
-    'OpenCVDecode', 'FrameSelector', 'MultiGroupCrop', 'MultiScaleCrop', 'RandomErasing',
+    'OpenCVDecode', 'FrameSelector', 'MultiGroupCrop', 'MultiScaleCrop',
     'RandomResizedCrop', 'RandomCrop', 'Resize', 'Flip', 'Fuse', 'Normalize',
     'ThreeCrop', 'CenterCrop', 'TenCrop', 'ImageToTensor', 'Transpose',
     'Collect', 'FormatShape', 'Compose', 'ToTensor', 'ToDataContainer',
diff --git a/mmaction/datasets/pipelines/augmentations.py b/mmaction/datasets/pipelines/augmentations.py
index 94f0a896ef..839fb115aa 100644
--- a/mmaction/datasets/pipelines/augmentations.py
+++ b/mmaction/datasets/pipelines/augmentations.py
@@ -5,8 +5,6 @@
 import mmcv
 import numpy as np
 from torch.nn.modules.utils import _pair
-import timm.data as tdata
-import torch
 
 from ..builder import PIPELINES
 
@@ -276,8 +274,7 @@ def __init__(self, transforms):
             self.aug = iaa.Sequential(
                 [self.imgaug_builder(t) for t in self.transforms])
 
-    @staticmethod
-    def default_transforms():
+    def default_transforms(self):
         """Default transforms for imgaug.
 
         Implement RandAugment by imgaug.
@@ -330,8 +327,8 @@ def default_transforms():
                         type='Cutout',
                         nb_iterations=1,
                         size=0.2 * cur_level,
-                        squared=True)
-                ])
+                        squared=True),
+                ]),
         ]
 
     def imgaug_builder(self, cfg):
@@ -424,41 +421,6 @@ def __call__(self, results):
 
         return results
 
-@PIPELINES.register_module()
-class RandomErasing(tdata.random_erasing.RandomErasing):
-    def __init__(self, device='cpu', **args):
-        super().__init__(device=device, **args)
-
-    def __call__(self, results):
-        in_type = results['imgs'][0].dtype.type
-
-        rand_state = random.getstate()
-        torchrand_state = torch.get_rng_state()
-        numpyrand_state = np.random.get_state()
-        # not using cuda to preserve the determiness
-
-        out_frame = []
-        for frame in results['imgs']:
-            random.setstate(rand_state)
-            torch.set_rng_state(torchrand_state)
-            np.random.set_state(numpyrand_state)
-            frame = super().__call__(torch.from_numpy(frame).permute(2, 0, 1)).permute(1, 2, 0).numpy()
-            out_frame.append(frame)
-
-        results['imgs'] = out_frame
-        img_h, img_w, _ = results['imgs'][0].shape
-
-        out_type = results['imgs'][0].dtype.type
-        assert in_type == out_type, \
-            ('Timmaug input dtype and output dtype are not the same. ',
-             f'Convert from {in_type} to {out_type}')
-
-        if 'gt_bboxes' in results:
-            raise NotImplementedError('only support recognition now')
-        assert results['img_shape'] == (img_h, img_w)
-
-        return results
-
 
 @PIPELINES.register_module()
 class Fuse:
@@ -592,17 +554,14 @@ def __init__(self, size, lazy=False):
         self.size = size
         self.lazy = lazy
 
-    @staticmethod
-    def _crop_kps(kps, crop_bbox):
+    def _crop_kps(self, kps, crop_bbox):
         return kps - crop_bbox[:2]
 
-    @staticmethod
-    def _crop_imgs(imgs, crop_bbox):
+    def _crop_imgs(self, imgs, crop_bbox):
         x1, y1, x2, y2 = crop_bbox
         return [img[y1:y2, x1:x2] for img in imgs]
 
-    @staticmethod
-    def _box_crop(box, crop_bbox):
+    def _box_crop(self, box, crop_bbox):
         """Crop the bounding boxes according to the crop_bbox.
 
         Args:
@@ -1110,12 +1069,10 @@ def _resize_imgs(self, imgs, new_w, new_h):
             for img in imgs
         ]
 
-    @staticmethod
-    def _resize_kps(kps, scale_factor):
+    def _resize_kps(self, kps, scale_factor):
         return kps * scale_factor
 
-    @staticmethod
-    def _box_resize(box, scale_factor):
+    def _box_resize(self, box, scale_factor):
         """Rescale the bounding boxes according to the scale_factor.
 
         Args:
@@ -1307,8 +1264,7 @@ def _flip_kps(self, kps, kpscores, img_width):
             kpscores = kpscores[:, :, new_order]
         return kps, kpscores
 
-    @staticmethod
-    def _box_flip(box, img_width):
+    def _box_flip(self, box, img_width):
         """Flip the bounding boxes given the width of the image.
 
         Args:
diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py
index 9e03cc6fe2..69e1d38e56 100644
--- a/mmaction/datasets/pipelines/loading.py
+++ b/mmaction/datasets/pipelines/loading.py
@@ -12,7 +12,6 @@
 
 from ...utils import get_random_string, get_shm_dir, get_thread_id
 from ..builder import PIPELINES
-import random
 
 
 @PIPELINES.register_module()
@@ -110,8 +109,7 @@ def __init__(self,
                  twice_sample=False,
                  out_of_bound_opt='loop',
                  test_mode=False,
-                 start_index=None,
-                 frame_uniform=False):
+                 start_index=None):
 
         self.clip_len = clip_len
         self.frame_interval = frame_interval
@@ -120,7 +118,6 @@ def __init__(self,
         self.twice_sample = twice_sample
         self.out_of_bound_opt = out_of_bound_opt
         self.test_mode = test_mode
-        self.frame_uniform = frame_uniform
         assert self.out_of_bound_opt in ['loop', 'repeat_last']
 
         if start_index is not None:
@@ -202,27 +199,6 @@ def _sample_clips(self, num_frames):
 
         return clip_offsets
 
-    def get_seq_frames(self, num_frames):
-        """
-        Modified from https://github.com/facebookresearch/SlowFast/blob/64abcc90ccfdcbb11cf91d6e525bed60e92a8796/slowfast/datasets/ssv2.py#L159
-        Given the video index, return the list of sampled frame indexes.
-        Args:
-            num_frames (int): Total number of frame in the video.
-        Returns:
-            seq (list): the indexes of frames of sampled from the video.
-        """
-        seg_size = float(num_frames - 1) / self.clip_len
-        seq = []
-        for i in range(self.clip_len):
-            start = int(np.round(seg_size * i))
-            end = int(np.round(seg_size * (i + 1)))
-            if not self.test_mode:
-                seq.append(random.randint(start, end))
-            else:
-                seq.append((start + end) // 2)
-
-        return np.array(seq)
-
     def __call__(self, results):
         """Perform the SampleFrames loading.
 
@@ -231,35 +207,31 @@ def __call__(self, results):
                 to the next transform in pipeline.
         """
         total_frames = results['total_frames']
-        if self.frame_uniform:  # sthv2 sampling strategy
-            assert results['start_index'] == 0
-            frame_inds = self.get_seq_frames(total_frames)
-        else:
-            clip_offsets = self._sample_clips(total_frames)
-            frame_inds = clip_offsets[:, None] + np.arange(
-                self.clip_len)[None, :] * self.frame_interval
-            frame_inds = np.concatenate(frame_inds)
-
-            if self.temporal_jitter:
-                perframe_offsets = np.random.randint(
-                    self.frame_interval, size=len(frame_inds))
-                frame_inds += perframe_offsets
-
-            frame_inds = frame_inds.reshape((-1, self.clip_len))
-            if self.out_of_bound_opt == 'loop':
-                frame_inds = np.mod(frame_inds, total_frames)
-            elif self.out_of_bound_opt == 'repeat_last':
-                safe_inds = frame_inds < total_frames
-                unsafe_inds = 1 - safe_inds
-                last_ind = np.max(safe_inds * frame_inds, axis=1)
-                new_inds = (safe_inds * frame_inds + (unsafe_inds.T * last_ind).T)
-                frame_inds = new_inds
-            else:
-                raise ValueError('Illegal out_of_bound option.')
 
-            start_index = results['start_index']
-            frame_inds = np.concatenate(frame_inds) + start_index
+        clip_offsets = self._sample_clips(total_frames)
+        frame_inds = clip_offsets[:, None] + np.arange(
+            self.clip_len)[None, :] * self.frame_interval
+        frame_inds = np.concatenate(frame_inds)
+
+        if self.temporal_jitter:
+            perframe_offsets = np.random.randint(
+                self.frame_interval, size=len(frame_inds))
+            frame_inds += perframe_offsets
+
+        frame_inds = frame_inds.reshape((-1, self.clip_len))
+        if self.out_of_bound_opt == 'loop':
+            frame_inds = np.mod(frame_inds, total_frames)
+        elif self.out_of_bound_opt == 'repeat_last':
+            safe_inds = frame_inds < total_frames
+            unsafe_inds = 1 - safe_inds
+            last_ind = np.max(safe_inds * frame_inds, axis=1)
+            new_inds = (safe_inds * frame_inds + (unsafe_inds.T * last_ind).T)
+            frame_inds = new_inds
+        else:
+            raise ValueError('Illegal out_of_bound option.')
 
+        start_index = results['start_index']
+        frame_inds = np.concatenate(frame_inds) + start_index
         results['frame_inds'] = frame_inds.astype(np.int)
         results['clip_len'] = self.clip_len
         results['frame_interval'] = self.frame_interval
diff --git a/mmaction/datasets/pipelines/pose_loading.py b/mmaction/datasets/pipelines/pose_loading.py
index ae198d42ed..9c19e25427 100644
--- a/mmaction/datasets/pipelines/pose_loading.py
+++ b/mmaction/datasets/pipelines/pose_loading.py
@@ -146,8 +146,7 @@ class PoseDecode:
     applicable).
     """
 
-    @staticmethod
-    def _load_kp(kp, frame_inds):
+    def _load_kp(self, kp, frame_inds):
         """Load keypoints given frame indices.
 
         Args:
@@ -157,8 +156,7 @@ def _load_kp(kp, frame_inds):
 
         return [x[frame_inds].astype(np.float32) for x in kp]
 
-    @staticmethod
-    def _load_kpscore(kpscore, frame_inds):
+    def _load_kpscore(self, kpscore, frame_inds):
         """Load keypoint scores given frame indices.
 
         Args:
@@ -191,7 +189,7 @@ def __call__(self, results):
         return results
 
     def __repr__(self):
-        repr_str = f'{self.__class__.__name__}()'
+        repr_str = (f'{self.__class__.__name__}()')
         return repr_str
 
 
diff --git a/mmaction/datasets/ssn_dataset.py b/mmaction/datasets/ssn_dataset.py
index 8a7f1dd0d2..76d24324df 100644
--- a/mmaction/datasets/ssn_dataset.py
+++ b/mmaction/datasets/ssn_dataset.py
@@ -767,7 +767,7 @@ def prepare_train_frames(self, idx):
         out_proposal_labels = []
         out_proposal_reg_targets = []
 
-        for _, proposal in enumerate(results['out_proposals']):
+        for proposal in results['out_proposals']:
             # proposal: [(video_id, SSNInstance), proposal_type]
             num_frames = proposal[0][1].num_video_frames
 
diff --git a/mmaction/models/__init__.py b/mmaction/models/__init__.py
index d612642376..aec46b3c53 100644
--- a/mmaction/models/__init__.py
+++ b/mmaction/models/__init__.py
@@ -15,13 +15,13 @@
                      CrossEntropyLoss, HVULoss, NLLLoss, OHEMHingeLoss,
                      SSNLoss)
 from .necks import TPN
-from .recognizers import (AudioRecognizer, BaseRecognizer, Recognizer2D,
-                          Recognizer3D)
+from .recognizers import (AudioRecognizer, BaseRecognizer, recognizer2d,
+                          recognizer3d, swintransformer3d)
 from .roi_extractors import SingleRoIExtractor3D
 
 __all__ = [
     'BACKBONES', 'HEADS', 'RECOGNIZERS', 'build_recognizer', 'build_head',
-    'build_backbone', 'Recognizer2D', 'Recognizer3D', 'C3D', 'ResNet',
+    'build_backbone', 'recognizer2d', 'recognizer3d', 'swintransformer3d', 'C3D', 'ResNet',
     'ResNet3d', 'ResNet2Plus1d', 'I3DHead', 'TSNHead', 'TSMHead', 'BaseHead',
     'BaseRecognizer', 'LOSSES', 'CrossEntropyLoss', 'NLLLoss', 'HVULoss',
     'ResNetTSM', 'ResNet3dSlowFast', 'SlowFastHead', 'Conv2plus1d',
diff --git a/mmaction/models/backbones/__init__.py b/mmaction/models/backbones/__init__.py
index ef1174d9c1..4999d12ba0 100644
--- a/mmaction/models/backbones/__init__.py
+++ b/mmaction/models/backbones/__init__.py
@@ -12,11 +12,9 @@
 from .resnet_tsm import ResNetTSM
 from .tanet import TANet
 from .x3d import X3D
-from .swin_transformer import SwinTransformer3D
-
 
 __all__ = [
     'C3D', 'ResNet', 'ResNet3d', 'ResNetTSM', 'ResNet2Plus1d',
     'ResNet3dSlowFast', 'ResNet3dSlowOnly', 'ResNet3dCSN', 'ResNetTIN', 'X3D',
-    'ResNetAudio', 'ResNet3dLayer', 'MobileNetV2TSM', 'MobileNetV2', 'TANet', 'SwinTransformer3D'
+    'ResNetAudio', 'ResNet3dLayer', 'MobileNetV2TSM', 'MobileNetV2', 'TANet'
 ]
diff --git a/mmaction/models/backbones/mobilenet_v2.py b/mmaction/models/backbones/mobilenet_v2.py
index 5dce73502b..5a093fa1fa 100644
--- a/mmaction/models/backbones/mobilenet_v2.py
+++ b/mmaction/models/backbones/mobilenet_v2.py
@@ -107,8 +107,8 @@ def forward(self, x):
         def _inner_forward(x):
             if self.use_res_connect:
                 return x + self.conv(x)
-
-            return self.conv(x)
+            else:
+                return self.conv(x)
 
         if self.with_cp and x.requires_grad:
             out = cp.checkpoint(_inner_forward, x)
@@ -275,8 +275,8 @@ def forward(self, x):
 
         if len(outs) == 1:
             return outs[0]
-
-        return tuple(outs)
+        else:
+            return tuple(outs)
 
     def _freeze_stages(self):
         if self.frozen_stages >= 0:
diff --git a/mmaction/models/backbones/resnet3d.py b/mmaction/models/backbones/resnet3d.py
index 79f98e0f7a..83a64801f7 100644
--- a/mmaction/models/backbones/resnet3d.py
+++ b/mmaction/models/backbones/resnet3d.py
@@ -345,15 +345,11 @@ class ResNet3d(nn.Module):
         dilations (Sequence[int]): Dilation of each stage.
             Default: ``(1, 1, 1, 1)``.
         conv1_kernel (Sequence[int]): Kernel size of the first conv layer.
-            Default: ``(3, 7, 7)``.
-        conv1_stride_s (int): Spatial stride of the first conv layer.
-            Default: 2.
+            Default: ``(5, 7, 7)``.
         conv1_stride_t (int): Temporal stride of the first conv layer.
-            Default: 1.
-        pool1_stride_s (int): Spatial stride of the first pooling layer.
             Default: 2.
         pool1_stride_t (int): Temporal stride of the first pooling layer.
-            Default: 1.
+            Default: 2.
         with_pool2 (bool): Whether to use pool2. Default: True.
         style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
             layer is the 3x3 conv layer, otherwise the stride-two layer is
@@ -362,7 +358,7 @@ class ResNet3d(nn.Module):
             not freezing any parameters. Default: -1.
         inflate (Sequence[int]): Inflate Dims of each block.
             Default: (1, 1, 1, 1).
-        inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the
+        inflate_style (str): ``3x1x1`` or ``1x1x1``. which determines the
             kernel sizes and padding strides for conv1 and conv2 in each block.
             Default: '3x1x1'.
         conv_cfg (dict): Config for conv layers. required keys are ``type``
@@ -405,11 +401,11 @@ def __init__(self,
                  spatial_strides=(1, 2, 2, 2),
                  temporal_strides=(1, 1, 1, 1),
                  dilations=(1, 1, 1, 1),
-                 conv1_kernel=(3, 7, 7),
+                 conv1_kernel=(5, 7, 7),
                  conv1_stride_s=2,
-                 conv1_stride_t=1,
+                 conv1_stride_t=2,
                  pool1_stride_s=2,
-                 pool1_stride_t=1,
+                 pool1_stride_t=2,
                  with_pool2=True,
                  style='pytorch',
                  frozen_stages=-1,
@@ -544,7 +540,7 @@ def make_res_layer(block,
                 Default: ``pytorch``.
             inflate (int | Sequence[int]): Determine whether to inflate
                 for each block. Default: 1.
-            inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines
+            inflate_style (str): ``3x1x1`` or ``1x1x1``. which determines
                 the kernel sizes and padding strides for conv1 and conv2
                 in each block. Default: '3x1x1'.
             non_local (int | Sequence[int]): Determine whether to apply
@@ -879,7 +875,7 @@ class ResNet3dLayer(nn.Module):
             the first 1x1 conv layer. Default: 'pytorch'.
         all_frozen (bool): Frozen all modules in the layer. Default: False.
         inflate (int): Inflate Dims of each block. Default: 1.
-        inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the
+        inflate_style (str): ``3x1x1`` or ``1x1x1``. which determines the
             kernel sizes and padding strides for conv1 and conv2 in each block.
             Default: '3x1x1'.
         conv_cfg (dict): Config for conv layers. required keys are ``type``
@@ -935,7 +931,7 @@ def __init__(self,
         self.pretrained2d = pretrained2d
         self.stage = stage
         # stage index is 0 based
-        assert 0 <= stage <= 3
+        assert stage >= 0 and stage <= 3
         self.base_channels = base_channels
 
         self.spatial_stride = spatial_stride
diff --git a/mmaction/models/backbones/resnet3d_csn.py b/mmaction/models/backbones/resnet3d_csn.py
index 5d041d5450..4539dec01e 100644
--- a/mmaction/models/backbones/resnet3d_csn.py
+++ b/mmaction/models/backbones/resnet3d_csn.py
@@ -84,7 +84,7 @@ class ResNet3dCSN(ResNet3d):
         norm_cfg (dict): Config for norm layers. required keys are `type` and
             `requires_grad`.
             Default: dict(type='BN3d', requires_grad=True, eps=1e-3).
-        inflate_style (str): `3x1x1` or `3x3x3`. which determines the kernel
+        inflate_style (str): `3x1x1` or `3x3x3`, which determines the kernel
             sizes and padding strides for conv1 and conv2 in each block.
             Default: '3x3x3'.
         bottleneck_mode (str): Determine which ways to factorize a 3D
diff --git a/mmaction/models/backbones/resnet3d_slowfast.py b/mmaction/models/backbones/resnet3d_slowfast.py
index be1ea1a2b2..45e9d5a7da 100644
--- a/mmaction/models/backbones/resnet3d_slowfast.py
+++ b/mmaction/models/backbones/resnet3d_slowfast.py
@@ -120,7 +120,7 @@ def make_res_layer(self,
                 Default: ``pytorch``.
             inflate (int | Sequence[int]): Determine whether to inflate
                 for each block. Default: 1.
-            inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines
+            inflate_style (str): ``3x1x1`` or ``1x1x1``. which determines
                 the kernel sizes and padding strides for conv1 and
                 conv2 in each block. Default: ``3x1x1``.
             non_local (int | Sequence[int]): Determine whether to apply
diff --git a/mmaction/models/backbones/resnet_audio.py b/mmaction/models/backbones/resnet_audio.py
index 63c0ff0d8a..d4fd9e1ece 100644
--- a/mmaction/models/backbones/resnet_audio.py
+++ b/mmaction/models/backbones/resnet_audio.py
@@ -180,7 +180,7 @@ def __init__(self,
         self.in_channels = in_channels
         self.base_channels = base_channels
         self.num_stages = num_stages
-        assert 1 <= num_stages <= 4
+        assert num_stages >= 1 and num_stages <= 4
         self.dilations = dilations
         self.conv1_kernel = conv1_kernel
         self.conv1_stride = conv1_stride
@@ -222,8 +222,8 @@ def __init__(self,
         self.feat_dim = self.block.expansion * self.base_channels * 2**(
             len(self.stage_blocks) - 1)
 
-    @staticmethod
-    def make_res_layer(block,
+    def make_res_layer(self,
+                       block,
                        inplanes,
                        planes,
                        blocks,
@@ -241,7 +241,7 @@ def make_res_layer(block,
             planes (int): Number of channels for the output feature
                 in each block.
             blocks (int): Number of residual blocks.
-            stride (Sequence[int]): Strides of residual blocks of each stage.
+            strides (Sequence[int]): Strides of residual blocks of each stage.
                 Default: (1, 2, 2, 2).
             dilation (int): Spacing between kernel elements. Default: 1.
             factorize (int | Sequence[int]): Determine whether to factorize
diff --git a/mmaction/models/backbones/tanet.py b/mmaction/models/backbones/tanet.py
index bb446ea23d..15d3487d1a 100644
--- a/mmaction/models/backbones/tanet.py
+++ b/mmaction/models/backbones/tanet.py
@@ -41,32 +41,32 @@ def __init__(self, block, num_segments, tam_cfg=dict()):
                                       'on Bottleneck block.')
 
     def forward(self, x):
-        assert isinstance(self.block, Bottleneck)
+        if isinstance(self.block, Bottleneck):
 
-        def _inner_forward(x):
-            """Forward wrapper for utilizing checkpoint."""
-            identity = x
+            def _inner_forward(x):
+                """Forward wrapper for utilizing checkpoint."""
+                identity = x
 
-            out = self.block.conv1(x)
-            out = self.tam(out)
-            out = self.block.conv2(out)
-            out = self.block.conv3(out)
+                out = self.block.conv1(x)
+                out = self.tam(out)
+                out = self.block.conv2(out)
+                out = self.block.conv3(out)
 
-            if self.block.downsample is not None:
-                identity = self.block.downsample(x)
+                if self.block.downsample is not None:
+                    identity = self.block.downsample(x)
 
-            out = out + identity
+                out = out + identity
 
-            return out
+                return out
 
-        if self.block.with_cp and x.requires_grad:
-            out = cp.checkpoint(_inner_forward, x)
-        else:
-            out = _inner_forward(x)
+            if self.block.with_cp and x.requires_grad:
+                out = cp.checkpoint(_inner_forward, x)
+            else:
+                out = _inner_forward(x)
 
-        out = self.block.relu(out)
+            out = self.block.relu(out)
 
-        return out
+            return out
 
 
 @BACKBONES.register_module()
diff --git a/mmaction/models/common/lfb.py b/mmaction/models/common/lfb.py
index f54ae36e31..e942dd165e 100644
--- a/mmaction/models/common/lfb.py
+++ b/mmaction/models/common/lfb.py
@@ -14,7 +14,7 @@
     lmdb_imported = False
 
 
-class LFB:
+class LFB(object):
     """Long-Term Feature Bank (LFB).
 
     LFB is proposed in `Long-Term Feature Banks for Detailed Video
diff --git a/mmaction/models/heads/bbox_head.py b/mmaction/models/heads/bbox_head.py
index 3f3bfeead0..cd2cc52622 100644
--- a/mmaction/models/heads/bbox_head.py
+++ b/mmaction/models/heads/bbox_head.py
@@ -122,8 +122,8 @@ def forward(self, x):
         # We do not predict bbox, so return None
         return cls_score, None
 
-    @staticmethod
-    def get_targets(sampling_results, gt_bboxes, gt_labels, rcnn_train_cfg):
+    def get_targets(self, sampling_results, gt_bboxes, gt_labels,
+                    rcnn_train_cfg):
         pos_proposals = [res.pos_bboxes for res in sampling_results]
         neg_proposals = [res.neg_bboxes for res in sampling_results]
         pos_gt_labels = [res.pos_gt_labels for res in sampling_results]
@@ -131,8 +131,7 @@ def get_targets(sampling_results, gt_bboxes, gt_labels, rcnn_train_cfg):
                                       pos_gt_labels, rcnn_train_cfg)
         return cls_reg_targets
 
-    @staticmethod
-    def recall_prec(pred_vec, target_vec):
+    def recall_prec(self, pred_vec, target_vec):
         """
         Args:
             pred_vec (tensor[N x C]): each element is either 0 or 1
@@ -145,7 +144,7 @@ def recall_prec(pred_vec, target_vec):
         prec = correct.sum(1) / (pred_vec.sum(1) + 1e-6)
         return recall.mean(), prec.mean()
 
-    def multi_label_accuracy(self, pred, target, thr=0.5):
+    def multilabel_accuracy(self, pred, target, thr=0.5):
         pred = pred.sigmoid()
         pred_vec = pred > thr
         # Target is 0 or 1, so using 0.5 as the borderline is OK
@@ -190,7 +189,7 @@ def loss(self,
             F_loss = self.focal_alpha * (1 - pt)**self.focal_gamma * loss
             losses['loss_action_cls'] = torch.mean(F_loss)
 
-            recall_thr, prec_thr, recall_k, prec_k = self.multi_label_accuracy(
+            recall_thr, prec_thr, recall_k, prec_k = self.multilabel_accuracy(
                 cls_score, labels, thr=0.5)
             losses['recall@thr=0.5'] = recall_thr
             losses['prec@thr=0.5'] = prec_thr
diff --git a/mmaction/models/heads/lfb_infer_head.py b/mmaction/models/heads/lfb_infer_head.py
index 69bdf8ae2a..1111b180c5 100644
--- a/mmaction/models/heads/lfb_infer_head.py
+++ b/mmaction/models/heads/lfb_infer_head.py
@@ -37,7 +37,7 @@ def __init__(self,
                  temporal_pool_type='avg',
                  spatial_pool_type='max'):
         super().__init__()
-        rank, _ = get_dist_info()
+        rank, world_size = get_dist_info()
         if rank == 0:
             if not osp.exists(lfb_prefix_path):
                 print(f'lfb prefix path {lfb_prefix_path} does not exist. '
diff --git a/mmaction/models/heads/misc_head.py b/mmaction/models/heads/misc_head.py
index 66e1b2c3b7..72cdaab547 100644
--- a/mmaction/models/heads/misc_head.py
+++ b/mmaction/models/heads/misc_head.py
@@ -75,7 +75,7 @@ def __init__(self,
             act_cfg=act_cfg)
 
         convs = []
-        for _ in range(num_convs - 1):
+        for i in range(num_convs - 1):
             conv = ConvModule(
                 out_channels,
                 out_channels,
diff --git a/mmaction/models/recognizers/__init__.py b/mmaction/models/recognizers/__init__.py
index 9d0bccd56f..0557ec7a8f 100644
--- a/mmaction/models/recognizers/__init__.py
+++ b/mmaction/models/recognizers/__init__.py
@@ -2,5 +2,9 @@
 from .base import BaseRecognizer
 from .recognizer2d import Recognizer2D
 from .recognizer3d import Recognizer3D
+from .swintransformer3d import SwinTransformer3D
 
-__all__ = ['BaseRecognizer', 'Recognizer2D', 'Recognizer3D', 'AudioRecognizer']
+__all__ = [
+    'BaseRecognizer', 'Recognizer2D', 'Recognizer3D', 'SwinTransformer3D',
+    'AudioRecognizer'
+]
diff --git a/mmaction/models/recognizers/base.py b/mmaction/models/recognizers/base.py
index 41164f3bd2..e76a170de3 100644
--- a/mmaction/models/recognizers/base.py
+++ b/mmaction/models/recognizers/base.py
@@ -124,11 +124,11 @@ def init_weights(self):
         """Initialize the model network weights."""
         if self.backbone_from in ['mmcls', 'mmaction2']:
             self.backbone.init_weights()
-        elif self.backbone_from in ['torchvision', 'timm']:
+        elif self.backbone_from == 'torchvision':
             warnings.warn('We do not initialize weights for backbones in '
-                          f'{self.backbone_from}, since the weights for '
-                          f'backbones in {self.backbone_from} are initialized'
-                          'in their __init__ functions.')
+                          'torchvision, since the weights for backbones in '
+                          'torchvision are initialized in their __init__ '
+                          'functions. ')
         else:
             raise NotImplementedError('Unsupported backbone source '
                                       f'{self.backbone_from}!')
@@ -151,8 +151,6 @@ def extract_feat(self, imgs):
         if (hasattr(self.backbone, 'features')
                 and self.backbone_from == 'torchvision'):
             x = self.backbone.features(imgs)
-        elif self.backbone_from == 'timm':
-            x = self.backbone.forward_features(imgs)
         else:
             x = self.backbone(imgs)
         return x
diff --git a/mmaction/models/recognizers/recognizer2d.py b/mmaction/models/recognizers/recognizer2d.py
index 6b4bedba04..d3444845f6 100644
--- a/mmaction/models/recognizers/recognizer2d.py
+++ b/mmaction/models/recognizers/recognizer2d.py
@@ -21,7 +21,7 @@ def forward_train(self, imgs, labels, **kwargs):
 
         x = self.extract_feat(imgs)
 
-        if self.backbone_from in ['torchvision', 'timm']:
+        if self.backbone_from == 'torchvision':
             if len(x.shape) == 4 and (x.shape[2] > 1 or x.shape[3] > 1):
                 # apply adaptive avg pooling
                 x = nn.AdaptiveAvgPool2d(1)(x)
@@ -55,7 +55,7 @@ def _do_test(self, imgs):
 
         x = self.extract_feat(imgs)
 
-        if self.backbone_from in ['torchvision', 'timm']:
+        if self.backbone_from == 'torchvision':
             if len(x.shape) == 4 and (x.shape[2] > 1 or x.shape[3] > 1):
                 # apply adaptive avg pooling
                 x = nn.AdaptiveAvgPool2d(1)(x)
diff --git a/mmaction/utils/__init__.py b/mmaction/utils/__init__.py
index d1478a2630..8cb60fcd7a 100644
--- a/mmaction/utils/__init__.py
+++ b/mmaction/utils/__init__.py
@@ -5,11 +5,9 @@
 from .misc import get_random_string, get_shm_dir, get_thread_id
 from .module_hooks import register_module_hooks
 from .precise_bn import PreciseBNHook
-from .optimizer import DistOptimizerHook
-
 
 __all__ = [
     'get_root_logger', 'collect_env', 'get_random_string', 'get_thread_id',
     'get_shm_dir', 'GradCAM', 'PreciseBNHook', 'import_module_error_class',
-    'import_module_error_func', 'register_module_hooks', 'DistOptimizerHook'
+    'import_module_error_func', 'register_module_hooks'
 ]
diff --git a/mmaction/utils/decorators.py b/mmaction/utils/decorators.py
index 727fa61df3..798bd2f4ff 100644
--- a/mmaction/utils/decorators.py
+++ b/mmaction/utils/decorators.py
@@ -10,6 +10,7 @@ def decorate(func):
         def new_func(*args, **kwargs):
             raise ImportError(
                 f'Please install {module_name} to use {func.__name__}.')
+            return func(*args, **kwargs)
 
         return new_func
 
diff --git a/mmaction/utils/precise_bn.py b/mmaction/utils/precise_bn.py
index 2751b2e736..c01bd4d109 100644
--- a/mmaction/utils/precise_bn.py
+++ b/mmaction/utils/precise_bn.py
@@ -30,7 +30,10 @@ def is_parallel_module(module):
     """
     parallels = (DataParallel, DistributedDataParallel,
                  MMDistributedDataParallel)
-    return bool(isinstance(module, parallels))
+    if isinstance(module, parallels):
+        return True
+    else:
+        return False
 
 
 @torch.no_grad()

From 37910ef3141c7b2eef76544f9ec8bdf26ec94c7d Mon Sep 17 00:00:00 2001
From: vidit <vidit.agarwal.eee20@itbhu.ac.in>
Date: Sun, 20 Mar 2022 06:22:19 +0530
Subject: [PATCH 2/2] Fixed mmaction folder to make Swin-T working and made the
 installation process much faster and easier

---
 environment.yml                               | 424 +++++++++++
 mmaction/core/lr/__init__.py                  |   3 +
 mmaction/core/lr/tin_lr_hook.py               |  39 +
 .../models/recognizers/swintransformer3d.py   | 681 ++++++++++++++++++
 setup.sh                                      |   7 +
 swint.py                                      |  92 +++
 6 files changed, 1246 insertions(+)
 create mode 100644 environment.yml
 create mode 100644 mmaction/core/lr/__init__.py
 create mode 100644 mmaction/core/lr/tin_lr_hook.py
 create mode 100644 mmaction/models/recognizers/swintransformer3d.py
 create mode 100644 setup.sh
 create mode 100644 swint.py

diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000000..deed242848
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,424 @@
+name: swint
+channels:
+  - anaconda
+  - defaults
+  - conda-forge
+dependencies:
+  - _anaconda_depends=2021.11=py37_0
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=4.5=1_gnu
+  - alabaster=0.7.12=py37_0
+  - anaconda=custom=py37_1
+  - anaconda-client=1.9.0=py37h06a4308_0
+  - anaconda-project=0.10.2=pyhd3eb1b0_0
+  - anyio=3.5.0=py37h06a4308_0
+  - appdirs=1.4.4=pyhd3eb1b0_0
+  - argcomplete=1.12.3=pyhd3eb1b0_0
+  - argh=0.26.2=py37_0
+  - argon2-cffi=21.3.0=pyhd3eb1b0_0
+  - argon2-cffi-bindings=21.2.0=py37h7f8727e_0
+  - arrow=0.13.1=py37_0
+  - asn1crypto=1.4.0=py_0
+  - astroid=2.6.6=py37h06a4308_0
+  - astropy=4.3.1=py37h09021b7_0
+  - async_generator=1.10=py37h28b3542_0
+  - atomicwrites=1.4.0=py_0
+  - attrs=21.4.0=pyhd3eb1b0_0
+  - autopep8=1.6.0=pyhd3eb1b0_0
+  - babel=2.9.1=pyhd3eb1b0_0
+  - backcall=0.2.0=pyhd3eb1b0_0
+  - backports=1.1=pyhd3eb1b0_0
+  - backports.shutil_get_terminal_size=1.0.0=pyhd3eb1b0_3
+  - beautifulsoup4=4.10.0=pyh06a4308_0
+  - binaryornot=0.4.4=pyhd3eb1b0_1
+  - bitarray=2.3.5=py37h7f8727e_0
+  - bkcharts=0.2=py37_0
+  - black=19.10b0=py_0
+  - blas=1.0=mkl
+  - bleach=4.1.0=pyhd3eb1b0_0
+  - blosc=1.21.0=h8c45485_0
+  - bokeh=2.4.2=py37h06a4308_0
+  - boto=2.49.0=py37_0
+  - bottleneck=1.3.2=py37heb32a55_1
+  - brotli=1.0.9=he6710b0_2
+  - brotlipy=0.7.0=py37h27cfd23_1003
+  - brunsli=0.1=h2531618_0
+  - bzip2=1.0.8=h7b6447c_0
+  - c-ares=1.18.1=h7f8727e_0
+  - ca-certificates=2020.10.14=0
+  - cairo=1.16.0=hf32fb01_1
+  - certifi=2020.6.20=py37_0
+  - cffi=1.15.0=py37hd667e15_1
+  - cfitsio=3.470=hf0d0db6_6
+  - chardet=4.0.0=py37h06a4308_1003
+  - charls=2.2.0=h2531618_0
+  - charset-normalizer=2.0.4=pyhd3eb1b0_0
+  - click=8.0.4=py37h06a4308_0
+  - cloudpickle=2.0.0=pyhd3eb1b0_0
+  - clyent=1.2.2=py37_1
+  - colorama=0.4.4=pyhd3eb1b0_0
+  - conda=4.11.0=py37h06a4308_0
+  - conda-content-trust=0.1.1=pyhd3eb1b0_0
+  - conda-pack=0.6.0=pyhd3eb1b0_0
+  - conda-package-handling=1.7.3=py37h27cfd23_1
+  - conda-token=0.3.0=pyhd3eb1b0_0
+  - contextlib2=0.6.0.post1=pyhd3eb1b0_0
+  - cookiecutter=1.7.2=pyhd3eb1b0_0
+  - curl=7.80.0=h7f8727e_0
+  - cycler=0.11.0=pyhd3eb1b0_0
+  - cython=0.29.25=py37hdbfa776_0
+  - cytoolz=0.11.0=py37h7b6447c_0
+  - daal4py=2021.5.0=py37h78b71dc_0
+  - dal=2021.5.1=h06a4308_803
+  - dask=2021.10.0=pyhd3eb1b0_0
+  - dask-core=2021.10.0=pyhd3eb1b0_0
+  - dataclasses=0.8=pyh6d0b6a4_7
+  - dbus=1.13.18=hb2f20db_0
+  - debugpy=1.5.1=py37h295c915_0
+  - decorator=4.4.2=py_0
+  - defusedxml=0.7.1=pyhd3eb1b0_0
+  - diff-match-patch=20200713=pyhd3eb1b0_0
+  - distributed=2021.10.0=py37h06a4308_0
+  - docutils=0.17.1=py37h06a4308_1
+  - entrypoints=0.3=py37_0
+  - et_xmlfile=1.1.0=py37h06a4308_0
+  - expat=2.4.4=h295c915_0
+  - fastcache=1.1.0=py37h7b6447c_0
+  - filelock=3.4.2=pyhd3eb1b0_0
+  - flake8=3.9.2=pyhd3eb1b0_0
+  - flask=1.1.2=pyhd3eb1b0_0
+  - fontconfig=2.13.1=h6c09931_0
+  - freetype=2.11.0=h70c0345_0
+  - fribidi=1.0.10=h7b6447c_0
+  - fsspec=2022.1.0=pyhd3eb1b0_0
+  - get_terminal_size=1.0.0=haa9412d_0
+  - gevent=21.8.0=py37h7f8727e_1
+  - giflib=5.2.1=h7b6447c_0
+  - glib=2.69.1=h4ff587b_1
+  - glob2=0.7=pyhd3eb1b0_0
+  - gmp=6.2.1=h2531618_2
+  - gmpy2=2.1.2=py37heeb90bb_0
+  - graphite2=1.3.14=h23475e2_0
+  - greenlet=1.1.1=py37h295c915_0
+  - gst-plugins-base=1.14.0=h8213a91_2
+  - gstreamer=1.14.0=h28cd5cc_2
+  - h5py=2.10.0=py37h7918eee_0
+  - harfbuzz=2.8.1=h6f93f22_0
+  - hdf5=1.10.4=hb1b8bf9_0
+  - heapdict=1.0.1=pyhd3eb1b0_0
+  - html5lib=1.1=pyhd3eb1b0_0
+  - icu=58.2=he6710b0_3
+  - idna=3.3=pyhd3eb1b0_0
+  - imagecodecs=2021.8.26=py37h4cda21f_0
+  - imageio=2.9.0=pyhd3eb1b0_0
+  - imagesize=1.3.0=pyhd3eb1b0_0
+  - importlib-metadata=4.8.2=py37h06a4308_0
+  - importlib_metadata=4.8.2=hd3eb1b0_0
+  - inflection=0.5.1=py37h06a4308_0
+  - iniconfig=1.1.1=pyhd3eb1b0_0
+  - intel-openmp=2021.4.0=h06a4308_3561
+  - intervaltree=3.1.0=pyhd3eb1b0_0
+  - ipykernel=6.4.1=py37h06a4308_1
+  - ipython=7.31.1=py37h06a4308_0
+  - ipython_genutils=0.2.0=pyhd3eb1b0_1
+  - ipywidgets=7.6.5=pyhd3eb1b0_1
+  - itsdangerous=2.0.1=pyhd3eb1b0_0
+  - jbig=2.1=hdba287a_0
+  - jdcal=1.4.1=pyhd3eb1b0_0
+  - jedi=0.18.1=py37h06a4308_1
+  - jeepney=0.7.1=pyhd3eb1b0_0
+  - jinja2=2.11.3=pyhd3eb1b0_0
+  - jinja2-time=0.2.0=pyhd3eb1b0_2
+  - joblib=1.1.0=pyhd3eb1b0_0
+  - jpeg=9d=h7f8727e_0
+  - json5=0.9.6=pyhd3eb1b0_0
+  - jsonschema=3.2.0=pyhd3eb1b0_2
+  - jupyter=1.0.0=py37_7
+  - jupyter_client=6.1.12=pyhd3eb1b0_0
+  - jupyter_console=6.4.0=pyhd3eb1b0_0
+  - jupyter_core=4.9.1=py37h06a4308_0
+  - jupyter_server=1.13.5=pyhd3eb1b0_0
+  - jupyterlab=3.2.9=pyhd3eb1b0_0
+  - jupyterlab_pygments=0.1.2=py_0
+  - jupyterlab_server=2.10.3=pyhd3eb1b0_1
+  - jupyterlab_widgets=1.0.0=pyhd3eb1b0_1
+  - jxrlib=1.1=h7b6447c_2
+  - keyring=23.4.0=py37h06a4308_0
+  - kiwisolver=1.3.2=py37h295c915_0
+  - krb5=1.19.2=hac12032_0
+  - lazy-object-proxy=1.6.0=py37h27cfd23_0
+  - lcms2=2.12=h3be6417_0
+  - ld_impl_linux-64=2.35.1=h7274673_9
+  - lerc=3.0=h295c915_0
+  - libaec=1.0.4=he6710b0_1
+  - libarchive=3.4.2=h62408e4_0
+  - libcurl=7.80.0=h0b77cf5_0
+  - libdeflate=1.8=h7f8727e_5
+  - libedit=3.1.20210910=h7f8727e_0
+  - libev=4.33=h7f8727e_1
+  - libffi=3.3=he6710b0_2
+  - libgcc-ng=9.3.0=h5101ec6_17
+  - libgfortran-ng=7.5.0=ha8ba4b0_17
+  - libgfortran4=7.5.0=ha8ba4b0_17
+  - libgomp=9.3.0=h5101ec6_17
+  - liblief=0.10.1=he6710b0_0
+  - libllvm11=11.1.0=h3826bc1_0
+  - libnghttp2=1.46.0=hce63b2e_0
+  - libpng=1.6.37=hbc83047_0
+  - libsodium=1.0.18=h7b6447c_0
+  - libspatialindex=1.9.3=h2531618_0
+  - libssh2=1.9.0=h1ba5d50_1
+  - libstdcxx-ng=9.3.0=hd4cf53a_17
+  - libtiff=4.2.0=h85742a9_0
+  - libtool=2.4.6=h295c915_1008
+  - libuuid=1.0.3=h7f8727e_2
+  - libuv=1.40.0=h7b6447c_0
+  - libwebp=1.2.2=h55f646e_0
+  - libwebp-base=1.2.2=h7f8727e_0
+  - libxcb=1.14=h7b6447c_0
+  - libxml2=2.9.12=h03d6c58_0
+  - libxslt=1.1.34=hc22bd24_0
+  - libzopfli=1.0.3=he6710b0_0
+  - llvmlite=0.37.0=py37h295c915_1
+  - locket=0.2.1=py37h06a4308_1
+  - lxml=4.7.1=py37h1f438cf_1
+  - lz4-c=1.9.3=h295c915_1
+  - lzo=2.10=h7b6447c_2
+  - markupsafe=1.1.1=py37h14c3975_1
+  - matplotlib=3.5.1=py37h06a4308_0
+  - matplotlib-base=3.5.1=py37ha18d171_0
+  - matplotlib-inline=0.1.2=pyhd3eb1b0_2
+  - mccabe=0.6.1=py37_1
+  - mistune=0.8.4=py37h14c3975_1001
+  - mkl=2021.4.0=h06a4308_640
+  - mkl-service=2.4.0=py37h7f8727e_0
+  - mkl_fft=1.3.1=py37hd3c417c_0
+  - mkl_random=1.2.2=py37h51133e4_0
+  - mock=4.0.3=pyhd3eb1b0_0
+  - more-itertools=8.12.0=pyhd3eb1b0_0
+  - mpc=1.1.0=h10f8cd9_1
+  - mpfr=4.0.2=hb69a4c5_1
+  - mpi=1.0=mpich
+  - mpich=3.3.2=hc856adb_0
+  - mpmath=1.2.1=py37h06a4308_0
+  - msgpack-python=1.0.2=py37hff7bd54_1
+  - multipledispatch=0.6.0=py37_0
+  - munkres=1.1.4=py_0
+  - mypy_extensions=0.4.3=py37h06a4308_1
+  - nbclassic=0.3.5=pyhd3eb1b0_0
+  - nbclient=0.5.11=pyhd3eb1b0_0
+  - nbconvert=6.3.0=py37h06a4308_0
+  - nbformat=5.1.3=pyhd3eb1b0_0
+  - ncurses=6.3=h7f8727e_2
+  - nest-asyncio=1.5.1=pyhd3eb1b0_0
+  - networkx=2.6.3=pyhd3eb1b0_0
+  - nltk=3.7=pyhd3eb1b0_0
+  - nose=1.3.7=pyhd3eb1b0_1008
+  - notebook=6.4.8=py37h06a4308_0
+  - numba=0.54.1=py37h51133e4_0
+  - numexpr=2.8.1=py37h6abb31d_0
+  - numpy=1.20.3=py37hf144106_0
+  - numpy-base=1.20.3=py37h74d4b33_0
+  - numpydoc=1.2=pyhd3eb1b0_0
+  - olefile=0.46=py37_0
+  - openjpeg=2.4.0=h3ad879b_0
+  - openpyxl=3.0.9=pyhd3eb1b0_0
+  - openssl=1.1.1m=h7f8727e_0
+  - packaging=21.3=pyhd3eb1b0_0
+  - pandas=1.3.4=py37h8c16a72_0
+  - pandocfilters=1.5.0=pyhd3eb1b0_0
+  - pango=1.45.3=hd140c19_0
+  - parso=0.8.3=pyhd3eb1b0_0
+  - partd=1.2.0=pyhd3eb1b0_0
+  - patchelf=0.13=h295c915_0
+  - path=16.2.0=pyhd3eb1b0_0
+  - path.py=12.5.0=hd3eb1b0_0
+  - pathlib2=2.3.6=py37h06a4308_2
+  - pathspec=0.7.0=py_0
+  - patsy=0.5.2=py37h06a4308_1
+  - pcre=8.45=h295c915_0
+  - pep8=1.7.1=py37_0
+  - pexpect=4.8.0=pyhd3eb1b0_3
+  - pickleshare=0.7.5=pyhd3eb1b0_1003
+  - pillow=9.0.1=py37h22f2fdc_0
+  - pip
+  - pixman=0.40.0=h7f8727e_1
+  - pkginfo=1.8.2=pyhd3eb1b0_0
+  - pluggy=1.0.0=py37h06a4308_0
+  - ply=3.11=py37_0
+  - poyo=0.5.0=pyhd3eb1b0_0
+  - prometheus_client=0.13.1=pyhd3eb1b0_0
+  - prompt-toolkit=3.0.20=pyhd3eb1b0_0
+  - prompt_toolkit=3.0.20=hd3eb1b0_0
+  - psutil=5.8.0=py37h27cfd23_1
+  - ptyprocess=0.7.0=pyhd3eb1b0_2
+  - py=1.11.0=pyhd3eb1b0_0
+  - py-lief=0.10.1=py37h403a769_0
+  - pycodestyle=2.7.0=pyhd3eb1b0_0
+  - pycosat=0.6.3=py37h27cfd23_0
+  - pycparser=2.21=pyhd3eb1b0_0
+  - pycrypto=2.6.1=py37h7b6447c_10
+  - pycurl=7.44.1=py37h8f2d780_1
+  - pydocstyle=6.1.1=pyhd3eb1b0_0
+  - pyerfa=2.0.0=py37h27cfd23_0
+  - pyflakes=2.3.1=pyhd3eb1b0_0
+  - pygments=2.11.2=pyhd3eb1b0_0
+  - pylint=2.9.6=py37h06a4308_1
+  - pyls-spyder=0.4.0=pyhd3eb1b0_0
+  - pyodbc=4.0.32=py37h295c915_0
+  - pyopenssl=22.0.0=pyhd3eb1b0_0
+  - pyparsing=3.0.4=pyhd3eb1b0_0
+  - pyqt=5.9.2=py37h05f1152_2
+  - pyrsistent=0.18.0=py37heee7806_0
+  - pysocks=1.7.1=py37_1
+  - pytables=3.6.1=py37h71ec239_0
+  - pytest=6.2.5=py37h06a4308_2
+  - python=3.7.11=h12debd9_0
+  - python-dateutil=2.8.2=pyhd3eb1b0_0
+  - python-libarchive-c=2.9=pyhd3eb1b0_1
+  - python-lsp-black=1.0.0=pyhd3eb1b0_0
+  - python-lsp-jsonrpc=1.0.0=pyhd3eb1b0_0
+  - python-lsp-server=1.2.4=pyhd3eb1b0_0
+  - python-slugify=5.0.2=pyhd3eb1b0_0
+  - pytz=2021.3=pyhd3eb1b0_0
+  - pywavelets=1.1.1=py37h7b6447c_2
+  - pyxdg=0.27=pyhd3eb1b0_0
+  - pyyaml=6.0=py37h7f8727e_1
+  - pyzmq=22.3.0=py37h295c915_2
+  - qdarkstyle=3.0.2=pyhd3eb1b0_0
+  - qstylizer=0.1.10=pyhd3eb1b0_0
+  - qt=5.9.7=h5867ecd_1
+  - qtawesome=1.0.3=pyhd3eb1b0_0
+  - qtconsole=5.2.2=pyhd3eb1b0_0
+  - qtpy=1.11.2=pyhd3eb1b0_0
+  - readline=8.1.2=h7f8727e_1
+  - regex=2021.11.2=py37h7f8727e_0
+  - requests=2.27.1=pyhd3eb1b0_0
+  - ripgrep=12.1.1=0
+  - rope=0.22.0=pyhd3eb1b0_0
+  - rtree=0.9.7=py37h06a4308_1
+  - ruamel_yaml=0.15.100=py37h27cfd23_0
+  - scikit-image=0.18.3=py37h51133e4_0
+  - scikit-learn=1.0.2=py37h51133e4_1
+  - scikit-learn-intelex=2021.5.0=py37h06a4308_0
+  - scipy=1.7.3=py37hc147768_0
+  - seaborn=0.11.2=pyhd3eb1b0_0
+  - secretstorage=3.3.1=py37h06a4308_0
+  - send2trash=1.8.0=pyhd3eb1b0_1
+  - setuptools=58.0.4=py37h06a4308_0
+  - simplegeneric=0.8.1=py37_2
+  - singledispatch=3.7.0=pyhd3eb1b0_1001
+  - sip=4.19.8=py37hf484d3e_0
+  - six=1.16.0=pyhd3eb1b0_1
+  - snappy=1.1.8=he6710b0_0
+  - sniffio=1.2.0=py37h06a4308_1
+  - snowballstemmer=2.2.0=pyhd3eb1b0_0
+  - sortedcollections=2.1.0=pyhd3eb1b0_0
+  - sortedcontainers=2.4.0=pyhd3eb1b0_0
+  - soupsieve=2.3.1=pyhd3eb1b0_0
+  - sphinx=4.4.0=pyhd3eb1b0_0
+  - sphinxcontrib=1.0=py37_1
+  - sphinxcontrib-applehelp=1.0.2=pyhd3eb1b0_0
+  - sphinxcontrib-devhelp=1.0.2=pyhd3eb1b0_0
+  - sphinxcontrib-htmlhelp=2.0.0=pyhd3eb1b0_0
+  - sphinxcontrib-jsmath=1.0.1=pyhd3eb1b0_0
+  - sphinxcontrib-qthelp=1.0.3=pyhd3eb1b0_0
+  - sphinxcontrib-serializinghtml=1.1.5=pyhd3eb1b0_0
+  - sphinxcontrib-websupport=1.2.4=py_0
+  - spyder=5.1.5=py37h06a4308_1
+  - spyder-kernels=2.1.3=py37h06a4308_0
+  - sqlalchemy=1.4.27=py37h7f8727e_0
+  - sqlite=3.37.2=hc218d9a_0
+  - statsmodels=0.12.2=py37h27cfd23_0
+  - sympy=1.9=py37h06a4308_0
+  - tbb=2021.5.0=hd09550d_0
+  - tbb4py=2021.5.0=py37hd09550d_0
+  - tblib=1.7.0=pyhd3eb1b0_0
+  - terminado=0.13.1=py37h06a4308_0
+  - testpath=0.5.0=pyhd3eb1b0_0
+  - text-unidecode=1.3=pyhd3eb1b0_0
+  - textdistance=4.2.1=pyhd3eb1b0_0
+  - threadpoolctl=2.2.0=pyh0d69192_0
+  - three-merge=0.1.1=pyhd3eb1b0_0
+  - tifffile=2021.7.2=pyhd3eb1b0_2
+  - tinycss=0.4=pyhd3eb1b0_1002
+  - tk=8.6.11=h1ccaba5_0
+  - toml=0.10.2=pyhd3eb1b0_0
+  - toolz=0.11.2=pyhd3eb1b0_0
+  - tornado=6.1=py37h27cfd23_0
+  - tqdm=4.62.3=pyhd3eb1b0_1
+  - traitlets=5.1.1=pyhd3eb1b0_0
+  - typed-ast=1.4.3=py37h7f8727e_1
+  - typing-extensions=3.10.0.2=hd3eb1b0_0
+  - typing_extensions=3.10.0.2=pyh06a4308_0
+  - ujson=4.2.0=py37h295c915_0
+  - unicodecsv=0.14.1=py37_0
+  - unidecode=1.2.0=pyhd3eb1b0_0
+  - unixodbc=2.3.9=h7b6447c_0
+  - urllib3=1.26.8=pyhd3eb1b0_0
+  - watchdog=2.1.6=py37h06a4308_0
+  - wcwidth=0.2.5=pyhd3eb1b0_0
+  - webencodings=0.5.1=py37_1
+  - websocket-client=0.58.0=py37h06a4308_4
+  - werkzeug=2.0.3=pyhd3eb1b0_0
+  - wheel=0.37.1=pyhd3eb1b0_0
+  - whichcraft=0.6.1=pyhd3eb1b0_0
+  - widgetsnbextension=3.5.2=py37h06a4308_0
+  - wrapt=1.12.1=py37h7b6447c_1
+  - wurlitzer=3.0.2=py37h06a4308_0
+  - xlrd=2.0.1=pyhd3eb1b0_0
+  - xlsxwriter=3.0.2=pyhd3eb1b0_0
+  - xlwt=1.3.0=py37_0
+  - xz=5.2.5=h7b6447c_0
+  - yaml=0.2.5=h7b6447c_0
+  - yapf=0.31.0=pyhd3eb1b0_0
+  - zeromq=4.3.4=h2531618_0
+  - zfp=0.5.5=h295c915_6
+  - zict=2.0.0=pyhd3eb1b0_0
+  - zipp=3.7.0=pyhd3eb1b0_0
+  - zlib=1.2.11=h7f8727e_4
+  - zope=1.0=py37_1
+  - zope.event=4.5.0=py37_0
+  - zope.interface=5.4.0=py37h7f8727e_0
+  - zstd=1.4.9=haebb681_0
+  - pip:
+    - addict==2.4.0
+    - audioread==2.1.9
+    - av==8.1.0
+    - coverage==6.3.2
+    - cryptography==2.8
+    - decord==0.6.0
+    - einops==0.4.1
+    - flatbuffers==2.0
+    - fonttools==4.29.1
+    - imageio-ffmpeg==0.4.5
+    - imgaug==0.4.0
+    - interrogate==1.5.0
+    - isort==4.3.21
+    - librosa==0.9.1
+    - lmdb==1.3.0
+    - mmaction2==0.15.0
+    - mmcv==1.3.1
+    - moviepy==1.0.3
+    - onnx==1.11.0
+    - onnxruntime==1.10.0
+    - opencv-contrib-python==4.5.5.62
+    - opencv-python-headless==4.5.5.62
+    - pep517==0.12.0
+    - pip==22.0.3
+    - pooch==1.6.0
+    - proglog==0.1.9
+    - protobuf==3.19.4
+    - pytest-runner==6.0.0
+    - pyturbojpeg==1.6.5
+    - resampy==0.2.2
+    - shapely==1.8.1.post1
+    - soundfile==0.10.3.post1
+    - tabulate==0.8.9
+    - timm==0.5.4
+    - tomli==2.0.1
+    - torch==1.10.2
+    - torchvision==0.11.3
+    - webcolors==1.11.1
+    - xdoctest==0.15.10
+prefix: /srv/conda/envs/swint
diff --git a/mmaction/core/lr/__init__.py b/mmaction/core/lr/__init__.py
new file mode 100644
index 0000000000..f2a29754b1
--- /dev/null
+++ b/mmaction/core/lr/__init__.py
@@ -0,0 +1,3 @@
+from .tin_lr_hook import TINLrUpdaterHook
+
+__all__ = ['TINLrUpdaterHook']
diff --git a/mmaction/core/lr/tin_lr_hook.py b/mmaction/core/lr/tin_lr_hook.py
new file mode 100644
index 0000000000..4fededd585
--- /dev/null
+++ b/mmaction/core/lr/tin_lr_hook.py
@@ -0,0 +1,63 @@
+from mmcv.runner import HOOKS, LrUpdaterHook
+from mmcv.runner.hooks.lr_updater import annealing_cos
+
+
+@HOOKS.register_module()
+class TINLrUpdaterHook(LrUpdaterHook):
+    """Cosine-annealing LR updater with the warmup rule of the TIN repo.
+
+    Args:
+        min_lr (float): Learning rate that the annealing decays towards.
+        **kwargs: Forwarded unchanged to ``LrUpdaterHook.__init__``.
+    """
+
+    def __init__(self, min_lr, **kwargs):
+        self.min_lr = min_lr
+        super().__init__(**kwargs)
+
+    def get_warmup_lr(self, cur_iters):
+        """Return the warmup learning rates for iteration ``cur_iters``.
+
+        Every entry of ``self.regular_lr`` is multiplied by one factor that
+        depends on ``self.warmup`` (``'linear'``, ``'constant'`` or ``'exp'``).
+
+        Raises:
+            ValueError: If ``self.warmup`` is none of the three modes above.
+        """
+        if self.warmup == 'linear':
+            # 'linear' warmup is rewritten according to TIN repo:
+            # https://github.com/deepcs233/TIN/blob/master/main.py#L409-L412
+            k = (cur_iters / self.warmup_iters) * (
+                1 - self.warmup_ratio) + self.warmup_ratio
+        elif self.warmup == 'constant':
+            k = self.warmup_ratio
+        elif self.warmup == 'exp':
+            k = self.warmup_ratio**(1 - cur_iters / self.warmup_iters)
+        else:
+            # Fail with a clear message instead of an UnboundLocalError
+            # at the return statement below.
+            raise ValueError(f'Unsupported warmup mode: {self.warmup!r}')
+        return [_lr * k for _lr in self.regular_lr]
+
+    def get_lr(self, runner, base_lr):
+        """Return the annealed learning rate for the current progress.
+
+        Progress is ``runner.epoch`` out of ``runner.max_epochs`` when
+        ``self.by_epoch`` is set, else ``runner.iter`` out of
+        ``runner.max_iters``. With warmup enabled, ``self.warmup_iters`` is
+        subtracted from both before the annealing factor is computed.
+        """
+        # NOTE(review): with ``by_epoch`` the subtraction below mixes an epoch
+        # count with ``warmup_iters`` -- confirm the units agree in practice.
+        if self.by_epoch:
+            progress = runner.epoch
+            max_progress = runner.max_epochs
+        else:
+            progress = runner.iter
+            max_progress = runner.max_iters
+
+        if self.warmup is not None:
+            progress = progress - self.warmup_iters
+            max_progress = max_progress - self.warmup_iters
+        factor = progress / max_progress
+        return annealing_cos(base_lr, self.min_lr, factor)
diff --git a/mmaction/models/recognizers/swintransformer3d.py b/mmaction/models/recognizers/swintransformer3d.py
new file mode 100644
index 0000000000..9f41a63e82
--- /dev/null
+++ b/mmaction/models/recognizers/swintransformer3d.py
@@ -0,0 +1,681 @@
+'''
+Credit to the official implementation: https://github.com/SwinTransformer/Video-Swin-Transformer
+'''
+
+import torch.nn as nn
+import torch.nn.functional as F
+# import torch.utils.checkpoint as checkpoint
+import numpy as np
+from timm.models.layers import DropPath, trunc_normal_
+
+from functools import reduce, lru_cache
+from operator import mul
+from einops import rearrange
+
+
+# def get_root_logger(log_file=None, log_level=logging.INFO):
+#     """Use ``get_logger`` method in mmcv to get the root logger.
+#     The logger will be initialized if it has not been initialized. By default a
+#     StreamHandler will be added. If ``log_file`` is specified, a FileHandler
+#     will also be added. The name of the root logger is the top-level package
+#     name, e.g., "mmaction".
+#     Args:
+#         log_file (str | None): The log filename. If specified, a FileHandler
+#             will be added to the root logger.
+#         log_level (int): The root logger level. Note that only the process of
+#             rank 0 is affected, while other processes will set the level to
+#             "Error" and be silent most of the time.
+#     Returns:
+#         :obj:`logging.Logger`: The root logger.
+#     """
+#     return get_logger(__name__.split('.')[0], log_file, log_level)
+
+
+class Mlp(nn.Module):
+    """ Two-layer feed-forward network.
+    Computes ``drop(fc2(drop(act(fc1(x)))))`` with a single shared dropout module.
+    """
+
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+        super().__init__()
+        # unspecified (or zero) widths fall back to the input width
+        hidden_features = hidden_features or in_features
+        out_features = out_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        """Apply the MLP to the last dimension of ``x``."""
+        hidden = self.drop(self.act(self.fc1(x)))
+        return self.drop(self.fc2(hidden))
+
+
+def window_partition(x, window_size):
+    """
+    Split a video feature map into non-overlapping 3D windows.
+    Args:
+        x: (B, D, H, W, C)
+        window_size (tuple[int]): window size (Wd, Wh, Ww); must divide (D, H, W)
+    Returns:
+        windows: (B*num_windows, Wd*Wh*Ww, C)
+    """
+    B, D, H, W, C = x.shape
+    wd, wh, ww = window_size
+    x = x.view(B, D // wd, wd, H // wh, wh, W // ww, ww, C)
+    return x.permute(0, 1, 3, 5, 2, 4, 6, 7).contiguous().view(-1, wd * wh * ww, C)
+
+
+def window_reverse(windows, window_size, B, D, H, W):
+    """
+    Merge 3D windows back into a single feature map.
+    Args:
+        windows: (B*num_windows, Wd, Wh, Ww, C)
+        window_size (tuple[int]): Window size (Wd, Wh, Ww)
+        B, D, H, W (int): Batch size, depth, height and width of the output
+    Returns:
+        x: (B, D, H, W, C)
+    """
+    wd, wh, ww = window_size
+    grid = windows.view(B, D // wd, H // wh, W // ww, wd, wh, ww, -1)
+    merged = grid.permute(0, 1, 4, 2, 5, 3, 6, 7).contiguous()
+    return merged.view(B, D, H, W, -1)
+
+
+def get_window_size(x_size, window_size, shift_size=None):
+    """Shrink the window to the input size (and zero the shift) on every axis where the
+    input is not larger than the window. Returns the window, or (window, shift) if given."""
+    use_window_size = list(window_size)
+    use_shift_size = None if shift_size is None else list(shift_size)
+    for axis, extent in enumerate(x_size):
+        if extent > window_size[axis]:
+            continue
+        use_window_size[axis] = extent
+        if use_shift_size is not None:
+            use_shift_size[axis] = 0
+    if use_shift_size is None:
+        return tuple(use_window_size)
+    return tuple(use_window_size), tuple(use_shift_size)
+
+
+class WindowAttention3D(nn.Module):
+    """ Window based multi-head self attention (W-MSA) module with relative position bias.
+    It supports both shifted and non-shifted windows.
+    Args:
+        dim (int): Number of input channels.
+        window_size (tuple[int]): The temporal length, height and width of the window.
+        num_heads (int): Number of attention heads.
+        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: False
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
+        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
+        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+    """
+
+    def __init__(self, dim, window_size, num_heads, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
+
+        super().__init__()
+        self.dim = dim
+        self.window_size = window_size  # Wd, Wh, Ww
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+
+        # learnable bias table with one row per possible relative offset (d, h, w)
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1) * (2 * window_size[2] - 1),
+                        num_heads))  # 2*Wd-1 * 2*Wh-1 * 2*Ww-1, nH
+
+        # for each pair of tokens inside the window, the row of the table holding their bias
+        coords_d = torch.arange(self.window_size[0])
+        coords_h = torch.arange(self.window_size[1])
+        coords_w = torch.arange(self.window_size[2])
+        coords = torch.stack(torch.meshgrid(coords_d, coords_h, coords_w))  # 3, Wd, Wh, Ww
+        coords_flatten = torch.flatten(coords, 1)  # 3, Wd*Wh*Ww
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 3, Wd*Wh*Ww, Wd*Wh*Ww
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wd*Wh*Ww, Wd*Wh*Ww, 3
+        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift every offset so it starts from 0
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        relative_coords[:, :, 2] += self.window_size[2] - 1
+
+        relative_coords[:, :, 0] *= (2 * self.window_size[1] - 1) * (2 * self.window_size[2] - 1)
+        relative_coords[:, :, 1] *= (2 * self.window_size[2] - 1)
+        relative_position_index = relative_coords.sum(-1)  # Wd*Wh*Ww, Wd*Wh*Ww (flattened d/h/w offset)
+        self.register_buffer("relative_position_index", relative_position_index)
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        trunc_normal_(self.relative_position_bias_table, std=.02)
+        self.softmax = nn.Softmax(dim=-1)
+
+    def forward(self, x, mask=None):
+        """ Forward function.
+        Args:
+            x: input features with shape of (num_windows*B, N, C)
+            mask: additive mask with shape of (num_windows, N, N), or None for no masking
+        """
+        B_, N, C = x.shape
+        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]  # each: B_, nH, N, C // nH
+
+        q = q * self.scale
+        attn = q @ k.transpose(-2, -1)
+
+        relative_position_bias = self.relative_position_bias_table[
+            self.relative_position_index[:N, :N].reshape(-1)].reshape(
+            N, N, -1)  # N, N, nH (index is cropped to the actual token count N)
+        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wd*Wh*Ww, Wd*Wh*Ww
+        attn = attn + relative_position_bias.unsqueeze(0)  # B_, nH, N, N
+
+        if mask is not None:
+            nW = mask.shape[0]
+            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.view(-1, self.num_heads, N, N)
+            attn = self.softmax(attn)
+        else:
+            attn = self.softmax(attn)
+
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class SwinTransformerBlock3D(nn.Module):
+    """ Swin Transformer Block.
+    Args:
+        dim (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (tuple[int]): Window size.
+        shift_size (tuple[int]): Shift size for SW-MSA.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+        use_checkpoint (bool, optional): Run both sub-blocks through torch.utils.checkpoint. Default: False
+    """
+
+    def __init__(self, dim, num_heads, window_size=(2, 7, 7), shift_size=(0, 0, 0),
+                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
+                 act_layer=nn.GELU, norm_layer=nn.LayerNorm, use_checkpoint=False):
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+        self.use_checkpoint = use_checkpoint
+
+        assert 0 <= self.shift_size[0] < self.window_size[0], "shift_size must in 0-window_size"
+        assert 0 <= self.shift_size[1] < self.window_size[1], "shift_size must in 0-window_size"
+        assert 0 <= self.shift_size[2] < self.window_size[2], "shift_size must in 0-window_size"
+
+        self.norm1 = norm_layer(dim)
+        self.attn = WindowAttention3D(
+            dim, window_size=self.window_size, num_heads=num_heads,
+            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
+
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+    def forward_part1(self, x, mask_matrix):
+        """Attention branch: norm1, pad to window multiples, (shifted) window attention, crop back."""
+        B, D, H, W, C = x.shape
+        window_size, shift_size = get_window_size((D, H, W), self.window_size, self.shift_size)
+        x = self.norm1(x)
+        # pad feature maps to multiples of window size
+        pad_l = pad_t = pad_d0 = 0
+        pad_d1 = (window_size[0] - D % window_size[0]) % window_size[0]
+        pad_b = (window_size[1] - H % window_size[1]) % window_size[1]
+        pad_r = (window_size[2] - W % window_size[2]) % window_size[2]
+        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b, pad_d0, pad_d1))
+        _, Dp, Hp, Wp, _ = x.shape
+        # cyclic shift
+        if any(i > 0 for i in shift_size):
+            shifted_x = torch.roll(x, shifts=(-shift_size[0], -shift_size[1], -shift_size[2]), dims=(1, 2, 3))
+            attn_mask = mask_matrix
+        else:
+            shifted_x = x
+            attn_mask = None
+        # partition windows
+        x_windows = window_partition(shifted_x, window_size)  # B*nW, Wd*Wh*Ww, C
+        # W-MSA/SW-MSA
+        attn_windows = self.attn(x_windows, mask=attn_mask)  # B*nW, Wd*Wh*Ww, C
+        # merge windows
+        attn_windows = attn_windows.view(-1, *(window_size + (C,)))
+        shifted_x = window_reverse(attn_windows, window_size, B, Dp, Hp, Wp)  # B D' H' W' C
+        # reverse cyclic shift
+        if any(i > 0 for i in shift_size):
+            x = torch.roll(shifted_x, shifts=(shift_size[0], shift_size[1], shift_size[2]), dims=(1, 2, 3))
+        else:
+            x = shifted_x
+        if pad_d1 > 0 or pad_r > 0 or pad_b > 0:
+            x = x[:, :D, :H, :W, :].contiguous()
+        return x
+
+    def forward_part2(self, x):
+        """MLP branch: drop_path(mlp(norm2(x))); the residual is added by the caller."""
+        return self.drop_path(self.mlp(self.norm2(x)))
+
+    def forward(self, x, mask_matrix):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, D, H, W, C).
+            mask_matrix: Attention mask for cyclic shift.
+        """
+        shortcut = x
+        if self.use_checkpoint:
+            import torch.utils.checkpoint as checkpoint
+            x = checkpoint.checkpoint(self.forward_part1, x, mask_matrix)
+        else:
+            x = self.forward_part1(x, mask_matrix)
+        x = shortcut + self.drop_path(x)
+
+        if self.use_checkpoint:
+            x = x + checkpoint.checkpoint(self.forward_part2, x)
+        else:
+            x = x + self.forward_part2(x)
+        return x
+
+
+class PatchMerging(nn.Module):
+    """ Patch Merging Layer
+    Concatenates every 2x2 spatial neighbourhood and projects 4*dim -> 2*dim channels.
+    Args:
+        dim (int): Number of input channels.
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(self, dim, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
+        self.norm = norm_layer(4 * dim)
+
+    def forward(self, x):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, D, H, W, C).
+        Returns:
+            Tensor of size (B, D, ceil(H/2), ceil(W/2), 2*dim).
+        """
+        _, _, H, W, _ = x.shape
+        pad_h, pad_w = H % 2, W % 2
+
+        if pad_h or pad_w:
+            # zero-pad odd spatial sizes so every 2x2 neighbourhood is complete
+            x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
+
+        # gather the four pixels of each neighbourhood along the channel axis
+        # (order: top-left, bottom-left, top-right, bottom-right)
+        quads = [x[:, :, i::2, j::2, :] for j in (0, 1) for i in (0, 1)]
+        x = torch.cat(quads, -1)  # B D H/2 W/2 4*C
+
+        x = self.norm(x)
+        return self.reduction(x)
+
+
+# cache each stage results (every argument has to be hashable for lru_cache)
+@lru_cache()
+def compute_mask(D, H, W, window_size, shift_size, device):
+    """Build the SW-MSA mask: 0 between tokens of the same shifted region, -100 otherwise."""
+    img_mask = torch.zeros((1, D, H, W, 1), device=device)  # 1 Dp Hp Wp 1
+    bands = [(slice(-ws), slice(-ws, -ss), slice(-ss, None)) for ws, ss in zip(window_size, shift_size)]
+    region = 0
+    for d in bands[0]:
+        for h in bands[1]:
+            for w in bands[2]:
+                img_mask[:, d, h, w, :] = region
+                region += 1
+    mask_windows = window_partition(img_mask, window_size).squeeze(-1)  # nW, ws[0]*ws[1]*ws[2]
+    diff = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+    return diff.masked_fill(diff != 0, float(-100.0)).masked_fill(diff == 0, float(0.0))
+
+
+class BasicLayer(nn.Module):
+    """ A basic Swin Transformer layer for one stage.
+    Args:
+        dim (int): Number of feature channels
+        depth (int): Depths of this stage.
+        num_heads (int): Number of attention head.
+        window_size (tuple[int]): Local window size. Default: (1,7,7).
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: False
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float | list[float], optional): Stochastic depth rate (one per block if a list). Default: 0.0
+        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
+    """
+
+    def __init__(self,
+                 dim,
+                 depth,
+                 num_heads,
+                 window_size=(1, 7, 7),
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 norm_layer=nn.LayerNorm,
+                 downsample=None,
+                 use_checkpoint=False):
+        super().__init__()
+        self.window_size = window_size
+        self.shift_size = tuple(i // 2 for i in window_size)
+        self.depth = depth
+        self.use_checkpoint = use_checkpoint
+
+        # build blocks: even-indexed blocks use no shift, odd-indexed blocks use shift_size
+        self.blocks = nn.ModuleList([
+            SwinTransformerBlock3D(
+                dim=dim,
+                num_heads=num_heads,
+                window_size=window_size,
+                shift_size=(0, 0, 0) if (i % 2 == 0) else self.shift_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop,
+                attn_drop=attn_drop,
+                drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
+                norm_layer=norm_layer,
+                use_checkpoint=use_checkpoint,
+            )
+            for i in range(depth)])
+
+        self.downsample = downsample
+        if self.downsample is not None:
+            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
+
+    def forward(self, x):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, C, D, H, W).
+        """
+        # calculate attention mask for SW-MSA on the size rounded up to window multiples
+        B, C, D, H, W = x.shape
+        window_size, shift_size = get_window_size((D, H, W), self.window_size, self.shift_size)
+        x = rearrange(x, 'b c d h w -> b d h w c')
+        Dp = int(np.ceil(D / window_size[0])) * window_size[0]
+        Hp = int(np.ceil(H / window_size[1])) * window_size[1]
+        Wp = int(np.ceil(W / window_size[2])) * window_size[2]
+        attn_mask = compute_mask(Dp, Hp, Wp, window_size, shift_size, x.device)
+        for blk in self.blocks:
+            x = blk(x, attn_mask)
+        x = x.view(B, D, H, W, -1)
+
+        if self.downsample is not None:
+            x = self.downsample(x)
+        x = rearrange(x, 'b d h w c -> b c d h w')
+        return x
+
+
+class PatchEmbed3D(nn.Module):
+    """ Video to Patch Embedding.
+    Args:
+        patch_size (tuple[int]): Patch token size (depth, height, width). Default: (2,4,4).
+        in_chans (int): Number of input video channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        norm_layer (nn.Module, optional): Normalization layer. Default: None
+    """
+
+    def __init__(self, patch_size=(2, 4, 4), in_chans=3, embed_dim=96, norm_layer=None):
+        super().__init__()
+        self.patch_size = patch_size
+
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+
+        self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+        self.norm = None if norm_layer is None else norm_layer(embed_dim)
+
+    def forward(self, x):
+        """Forward function.
+        Embeds a (B, C, D, H, W) video into (B, embed_dim, D', H', W') patch tokens.
+        """
+        # zero-pad the trailing edge of each axis up to a multiple of the patch size
+        pd, ph, pw = self.patch_size
+        _, _, D, H, W = x.size()
+        if W % pw != 0:
+            x = F.pad(x, (0, pw - W % pw))
+        if H % ph != 0:
+            x = F.pad(x, (0, 0, 0, ph - H % ph))
+        if D % pd != 0:
+            x = F.pad(x, (0, 0, 0, 0, 0, pd - D % pd))
+
+        x = self.proj(x)  # B C D Wh Ww
+        if self.norm is None:
+            return x
+
+        # the norm is applied with channels last: flatten to (B, N, C), normalize, restore
+        D, Wh, Ww = x.shape[2:]
+        x = self.norm(x.flatten(2).transpose(1, 2))
+        return x.transpose(1, 2).view(-1, self.embed_dim, D, Wh, Ww)
+
+
+import torch
+from torch import nn
+from mmaction.models.heads.i3d_head import I3DHead
+
+
+class SwinTransformer3D(nn.Module):
+    """ Swin Transformer backbone followed by an I3D classification head.
+        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
+          https://arxiv.org/pdf/2103.14030
+    Args:
+        patch_size (tuple(int)): Patch size. Default: (4,4,4).
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        depths (tuple[int]): Depths of each Swin Transformer stage.
+        num_heads (tuple[int]): Number of attention head of each stage.
+        window_size (tuple[int]): Window size. Default: (2,7,7).
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
+        drop_rate (float): Dropout rate.
+        attn_drop_rate (float): Attention dropout rate. Default: 0.
+        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
+        norm_layer: Normalization layer. Default: nn.LayerNorm.
+        patch_norm (bool): If True, add normalization after patch embedding. Default: False.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters.
+        num_classes (int): Number of classes passed to the I3D head. Default: 400.
+    """
+
+    def __init__(self,
+                 pretrained=None,
+                 pretrained2d=True,
+                 patch_size=(4, 4, 4),
+                 in_chans=3,
+                 embed_dim=96,
+                 depths=[2, 2, 6, 2],
+                 num_heads=[3, 6, 12, 24],
+                 window_size=(2, 7, 7),
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.2,
+                 norm_layer=nn.LayerNorm,
+                 patch_norm=False,
+                 frozen_stages=-1,
+                 use_checkpoint=False,
+                 num_classes=400):
+        super().__init__()
+
+        self.pretrained = pretrained
+        self.pretrained2d = pretrained2d
+        self.num_layers = len(depths)
+        self.embed_dim = embed_dim
+        self.patch_norm = patch_norm
+        self.frozen_stages = frozen_stages
+        self.window_size = window_size
+        self.patch_size = patch_size
+
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed3D(
+            patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
+            norm_layer=norm_layer if self.patch_norm else None)
+
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # stochastic depth
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
+
+        # build layers
+        self.layers = nn.ModuleList()
+        for i_layer in range(self.num_layers):
+            layer = BasicLayer(
+                dim=int(embed_dim * 2 ** i_layer),
+                depth=depths[i_layer],
+                num_heads=num_heads[i_layer],
+                window_size=window_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+                norm_layer=norm_layer,
+                downsample=PatchMerging if i_layer < self.num_layers - 1 else None,
+                use_checkpoint=use_checkpoint)
+            self.layers.append(layer)
+
+        self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
+
+        # add a norm layer for each output
+        self.norm = norm_layer(self.num_features)
+        self._freeze_stages()
+        self.I3HD = I3DHead(num_classes=num_classes, in_channels=self.num_features)
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.requires_grad = False
+
+        if self.frozen_stages >= 1:
+            self.pos_drop.eval()
+            for i in range(0, self.frozen_stages):
+                m = self.layers[i]
+                m.eval()
+                for param in m.parameters():
+                    param.requires_grad = False
+
+    # def inflate_weights(self, logger):
+    #     """Inflate the swin2d parameters to swin3d.
+    #     The differences between swin3d and swin2d mainly lie in an extra
+    #     axis. To utilize the pretrained parameters in 2d model,
+    #     the weight of swin2d models should be inflated to fit in the shapes of
+    #     the 3d counterpart.
+    #     Args:
+    #         logger (logging.Logger): The logger used to print
+    #             debugging infomation.
+    #     """
+    #     checkpoint = torch.load(self.pretrained, map_location='cpu')
+    #     state_dict = checkpoint['model']
+    #
+    #     # delete relative_position_index since we always re-init it
+    #     relative_position_index_keys = [k for k in state_dict.keys() if "relative_position_index" in k]
+    #     for k in relative_position_index_keys:
+    #         del state_dict[k]
+    #
+    #     # delete attn_mask since we always re-init it
+    #     attn_mask_keys = [k for k in state_dict.keys() if "attn_mask" in k]
+    #     for k in attn_mask_keys:
+    #         del state_dict[k]
+    #
+    #     state_dict['patch_embed.proj.weight'] = state_dict['patch_embed.proj.weight'].unsqueeze(2).repeat(1,1,self.patch_size[0],1,1) / self.patch_size[0]
+    #
+    #     # bicubic interpolate relative_position_bias_table if not match
+    #     relative_position_bias_table_keys = [k for k in state_dict.keys() if "relative_position_bias_table" in k]
+    #     for k in relative_position_bias_table_keys:
+    #         relative_position_bias_table_pretrained = state_dict[k]
+    #         relative_position_bias_table_current = self.state_dict()[k]
+    #         L1, nH1 = relative_position_bias_table_pretrained.size()
+    #         L2, nH2 = relative_position_bias_table_current.size()
+    #         L2 = (2*self.window_size[1]-1) * (2*self.window_size[2]-1)
+    #         wd = self.window_size[0]
+    #         if nH1 != nH2:
+    #             logger.warning(f"Error in loading {k}, passing")
+    #         else:
+    #             if L1 != L2:
+    #                 S1 = int(L1 ** 0.5)
+    #                 relative_position_bias_table_pretrained_resized = torch.nn.functional.interpolate(
+    #                     relative_position_bias_table_pretrained.permute(1, 0).view(1, nH1, S1, S1), size=(2*self.window_size[1]-1, 2*self.window_size[2]-1),
+    #                     mode='bicubic')
+    #                 relative_position_bias_table_pretrained = relative_position_bias_table_pretrained_resized.view(nH2, L2).permute(1, 0)
+    #         state_dict[k] = relative_position_bias_table_pretrained.repeat(2*wd-1,1)
+    #
+    #     msg = self.load_state_dict(state_dict, strict=False)
+    #     logger.info(msg)
+    #     logger.info(f"=> loaded successfully '{self.pretrained}'")
+    #     del checkpoint
+    #     torch.cuda.empty_cache()
+
+    # def init_weights(self, pretrained=None):
+    #     """Initialize the weights in backbone.
+    #     Args:
+    #         pretrained (str, optional): Path to pre-trained weights.
+    #             Defaults to None.
+    #     """
+    #     def _init_weights(m):
+    #         if isinstance(m, nn.Linear):
+    #             trunc_normal_(m.weight, std=.02)
+    #             if isinstance(m, nn.Linear) and m.bias is not None:
+    #                 nn.init.constant_(m.bias, 0)
+    #         elif isinstance(m, nn.LayerNorm):
+    #             nn.init.constant_(m.bias, 0)
+    #             nn.init.constant_(m.weight, 1.0)
+    #
+    #     if pretrained:
+    #         self.pretrained = pretrained
+    #     if isinstance(self.pretrained, str):
+    #         self.apply(_init_weights)
+    #         logger = get_root_logger()
+    #         logger.info(f'load model from: {self.pretrained}')
+    #
+    #         if self.pretrained2d:
+    #             # Inflate 2D model into 3D model.
+    #             self.inflate_weights(logger)
+    #         else:
+    #             # Directly load 3D model.
+    #             load_checkpoint(self, self.pretrained, strict=False, logger=logger)
+    #     elif self.pretrained is None:
+    #         self.apply(_init_weights)
+    #     else:
+    #         raise TypeError('pretrained must be a str or None')
+
+    def forward(self, x):
+        """Run the backbone and the I3D head on a video batch of shape (N, C, D, H, W)."""
+        x = self.patch_embed(x)
+        x = self.pos_drop(x)
+        for layer in self.layers:
+            x = layer(x.contiguous())
+        x = rearrange(x, 'n c d h w -> n d h w c')
+        x = self.norm(x)
+        x = rearrange(x, 'n d h w c -> n c d h w')
+        x = self.I3HD(x)
+        return x
+
+    def train(self, mode=True):
+        """Switch train/eval mode while keeping frozen stages frozen; returns ``self``."""
+        super(SwinTransformer3D, self).train(mode)
+        self._freeze_stages()
+        return self  # nn.Module.train/eval must return the module so calls can be chained
diff --git a/setup.sh b/setup.sh
new file mode 100644
index 0000000000..2a34e2cfcc
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+conda env create -f environment.yml
+source "$(conda info --base)/etc/profile.d/conda.sh" && conda activate swint  # load conda's shell hook first: a bare `conda activate` fails in a non-interactive script
+pip install gdown
+gdown "https://drive.google.com/uc?id=1CC0DwkrJ3Lb-DhHXrmQ8g6mCr3I74umf"  # quoted so the `?` can never be glob-expanded
+cp -r mmaction/* /srv/conda/envs/swint/lib/python3.7/site-packages/mmaction/
+# NOTE(review): stray link that used to be executed as a command; confirm whether it should be downloaded: https://drive.google.com/file/d/1z6Wqx2y0rUD_YyAWiwEAD8dgvHvAAgaD/view?usp=sharing
\ No newline at end of file
diff --git a/swint.py b/swint.py
new file mode 100644
index 0000000000..f4ff254e58
--- /dev/null
+++ b/swint.py
@@ -0,0 +1,92 @@
+"""
+On the terminal run:
+        mkdir weights
+        cd weights
+        gdown https://drive.google.com/uc?id=10_ArqSj837hBzoQTq3RPGBZgKbBvNfSe
+to use the use_pretrained function with default parameters.
+"""
+import cv2
+import requests
+import torch
+from torchvision import transforms
+from tqdm import tqdm
+
+from mmaction.models.recognizers.swintransformer3d import SwinTransformer3D
+
+
+def video2img(video_path: str):
+    """Sample every 20th frame of a video into one normalized tensor.
+    Args:
+        video_path: path to the video.
+    Returns:
+        float tensor of shape (1, channels, frames, 224, 224).
+    Raises:
+        RuntimeError: if no frame could be read from ``video_path``.
+    """
+    transform = transforms.Compose([
+        transforms.Resize(224),
+        transforms.CenterCrop(224),
+        # NOTE(review): cv2 yields BGR; these stats look RGB-ordered - confirm.
+        transforms.Normalize(mean=[123.675, 116.28, 103.53],
+                             std=[58.395, 57.12, 57.375]),
+    ])
+    frames, index = [], 0
+    vidcap = cv2.VideoCapture(video_path)
+    try:
+        success, image = vidcap.read()
+        while success:
+            if index % 20 == 0:
+                pixels = torch.tensor(image).type(torch.FloatTensor)
+                frames.append(
+                    transform(pixels.permute(2, 0, 1)).unsqueeze(dim=0))
+            success, image = vidcap.read()
+            index += 1
+    finally:
+        vidcap.release()  # free the capture handle even if a frame fails
+    if not frames: raise RuntimeError(f"no frames read from {video_path!r}")
+    return torch.stack(frames, dim=2)
+
+
+def use_pretrained(model, folder='weights/',
+                   file_name="swint_victim_pretrained.pth",
+                   download=False, url=None):
+    """Load pretrained weights into ``model``, optionally downloading first.
+    Args:
+        model: model to load the weights to.
+        folder: folder holding the weights file (created when downloading).
+        file_name: name of the weights file inside ``folder``.
+        download: whether to fetch the weights from ``url`` before loading.
+        url: url to download from; must be set when ``download`` is true.
+    Returns:
+        ``model`` with loaded weights, or -1 if the download was incomplete.
+    """
+    import os  # local import: os is not imported at the top of this file
+    weights_path = os.path.join(folder, file_name)
+    if download:
+        os.makedirs(folder or '.', exist_ok=True)
+        response = requests.get(url, stream=True)
+        response.raise_for_status()  # never save an HTTP error page as weights
+        t = int(response.headers.get('content-length', 0))
+        with open(weights_path, 'wb') as file, \
+                tqdm(total=t, unit='iB', unit_scale=True) as progress_bar:
+            for data in response.iter_content(1024 ** 2):  # 1 MiB chunks
+                progress_bar.update(len(data))
+                file.write(data)
+        if (t != 0) and (progress_bar.n != t):
+            print("ERROR downloading weights!")
+            return -1
+        print(f"Weights downloaded in {folder} directory!")
+    model.load_state_dict(torch.load(weights_path, map_location='cpu'))
+    return model
+
+
+model = SwinTransformer3D()
+use_pretrained(model)
+# The input must be of the form (batchSize, channels, frames, height, width)
+loss_func = torch.nn.CrossEntropyLoss()
+optimizer = torch.optim.AdamW(
+    model.parameters(),  # optimise the model built above
+    lr=0.001,
+    betas=(0.9, 0.999),
+    weight_decay=0.02
+)