Trying to fix both tests and linting (#258)
* trying to fix both tests and linting

* fixing flake8 version

* trying to fix flake8

* fixing linting errors

* fixing linting CI

* fixing linting CI

* fixing linting CI

* fixing linting CI

* fixing linting CI

* fixing linting CI

* fixing linting CI

* fixing linting CI

* fixing linting CI

* fixing linting CI

* fix linting

* fix linting

* trying to make flake8 work
jeandut authored Nov 18, 2022
1 parent 8963217 commit f1e0564
Showing 57 changed files with 210 additions and 394 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/linting.yml
@@ -23,9 +23,14 @@ jobs:

- name: Install dependencies
run: pip install isort black==22.3.0
pip install flake8

- name: Run black
run: black --check .
run: black --line-length=89 --check .


- name: Run FLAKE8
run: flake8 --max-line-length=89 --per-file-ignores="*/__init__.py:F401" ./flamby

- name: Run isort
run: isort .
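
For reference, the same checks can be reproduced locally before pushing. The sketch below is a hypothetical helper script (not part of the repository) and assumes black==22.3.0, flake8 and isort are installed in the current environment:

# lint_local.py (hypothetical): re-run the lint commands from the workflow above.
import subprocess
import sys

CHECKS = [
    ["black", "--line-length=89", "--check", "."],
    ["flake8", "--max-line-length=89",
     "--per-file-ignores=*/__init__.py:F401", "./flamby"],
    # The workflow runs plain "isort ."; add --check-only to avoid rewriting files.
    ["isort", "."],
]

exit_code = 0
for cmd in CHECKS:
    print("running:", " ".join(cmd))
    exit_code = exit_code or subprocess.run(cmd).returncode

sys.exit(exit_code)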
2 changes: 2 additions & 0 deletions .github/workflows/pr_validation.yml
@@ -15,6 +15,8 @@ jobs:
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: '3.10'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
4 changes: 2 additions & 2 deletions docs/conf.py
@@ -175,7 +175,7 @@
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, "FLamby.tex", "FLamby Documentation", "Collaboration", "manual"),
(master_doc, "FLamby.tex", "FLamby Documentation", "Collaboration", "manual")
]


@@ -200,7 +200,7 @@
"FLamby",
"One line description of project.",
"Miscellaneous",
),
)
]


4 changes: 1 addition & 3 deletions flamby/benchmarks/benchmark_utils.py
@@ -568,9 +568,7 @@ def ensemble_perf_from_predictions(
return ensemble_perf


def set_dataset_specific_config(
dataset_name, compute_ensemble_perf=False, use_gpu=True
):
def set_dataset_specific_config(dataset_name, compute_ensemble_perf=False, use_gpu=True):
"""_summary_
Parameters
3 changes: 1 addition & 2 deletions flamby/benchmarks/conf.py
@@ -85,8 +85,7 @@ def get_dataset_args(
for param in params:
try:
p = getattr(
__import__(f"flamby.datasets.{dataset_name}", fromlist=param),
param,
__import__(f"flamby.datasets.{dataset_name}", fromlist=param), param
)
except AttributeError:
p = None
10 changes: 2 additions & 8 deletions flamby/benchmarks/fed_benchmark.py
@@ -464,10 +464,7 @@ def main(args_cli):

parser = argparse.ArgumentParser()
parser.add_argument(
"--GPU",
type=int,
default=0,
help="GPU to run the training on (if available)",
"--GPU", type=int, default=0, help="GPU to run the training on (if available)"
)
parser.add_argument(
"--cpu-only",
@@ -488,10 +485,7 @@ def main(args_cli):
help="Do 0 round and 0 epoch to check if the script is working",
)
parser.add_argument(
"--workers",
type=int,
default=0,
help="Numbers of workers for the dataloader",
"--workers", type=int, default=0, help="Numbers of workers for the dataloader"
)
parser.add_argument(
"--learning_rate",
5 changes: 1 addition & 4 deletions flamby/create_dataset_config.py
@@ -5,10 +5,7 @@
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--path",
type=str,
help="The path where the dataset is located",
required=True,
"--path", type=str, help="The path where the dataset is located", required=True
)
parser.add_argument(
"--dataset-name",
3 changes: 2 additions & 1 deletion flamby/datasets/fed_camelyon16/dataset.py
@@ -86,7 +86,8 @@ def __init__(
self.features_centers = []
self.features_sets = []
self.perms = {}
# We need this ist to be sorted for reproducibility but shuffled to avoid weirdness
# We need this list to be sorted for reproducibility but shuffled to
# avoid weirdness
npys_list = sorted(self.tiles_dir.glob("*.npy"))
random.seed(0)
random.shuffle(npys_list)
(diff for another file; file path not shown)
@@ -5,6 +5,7 @@
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from google_client import create_service
from googleapiclient.errors import HttpError
@@ -65,12 +66,11 @@ def main(path_to_secret, output_folder, port=6006, debug=False):
len(train_df.index) + len(test_df.index)
)
downloaded_images_status_file["Slide"] = None
downloaded_images_status_file.Slide.iloc[: len(train_df.index)] = train_df[
"name"
]
downloaded_images_status_file.Slide.iloc[len(train_df.index) :] = test_df[
"name"
]
total_size = len(train_df.index) + len(test_df.index)
train_idxs = np.arange(0, len(train_df.index))
test_idxs = np.arange(len(train_df.index), total_size)
downloaded_images_status_file.Slide.iloc[train_idxs] = train_df["name"]
downloaded_images_status_file.Slide.iloc[test_idxs] = test_df["name"]
downloaded_images_status_file.to_csv(
downloaded_images_status_file_path, index=False
)
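
As a toy illustration of the indexing change above (hypothetical names, not the actual download script), positional index arrays built with np.arange select the same rows as the original iloc slices, while avoiding the space-before-colon slice formatting that black produces and flake8 reports as E203, which is presumably the reason for the rewrite:

import numpy as np

slide_names = ["slide_a", "slide_b", "slide_c", "slide_d"]
n_train = 3

train_idxs = np.arange(0, n_train)                 # 0, 1, 2
test_idxs = np.arange(n_train, len(slide_names))   # 3

# Same selection as slide_names[:n_train] and slide_names[n_train:].
assert [slide_names[i] for i in train_idxs] == slide_names[:n_train]
assert [slide_names[i] for i in test_idxs] == slide_names[n_train:]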
@@ -92,7 +92,9 @@ def main(path_to_secret, output_folder, port=6006, debug=False):
port=port,
)
regex = "(?<=https://drive.google.com/file/d/)[a-zA-Z0-9]+"
# Resourcekey is now mandatory (credit @Kris in: https://stackoverflow.com/questions/71343002/downloading-files-from-public-google-drive-in-python-scoping-issues)
# Resourcekey is now mandatory (credit @Kris in:
# https://stackoverflow.com/questions/71343002/
# downloading-files-from-public-google-drive-in-python-scoping-issues)
regex_rkey = "(?<=resourcekey=).+"
for current_df in [train_df, test_df]:
for i in tqdm(range(len(current_df.index))):
(diff for another file; file path not shown)
@@ -52,9 +52,7 @@ def __len__(self):

def __getitem__(self, idx):
pil_image = self.slide.read_region(
self.coords[idx].astype("int_"),
self.level,
(self.tile_size, self.tile_size),
self.coords[idx].astype("int_"), self.level, (self.tile_size, self.tile_size)
).convert("RGB")
if self.transform is not None:
pil_image = self.transform(pil_image)
6 changes: 2 additions & 4 deletions flamby/datasets/fed_camelyon16/model.py
@@ -10,15 +10,13 @@ class Baseline(nn.Module):
def __init__(self):
super(Baseline, self).__init__()
# As per the article
self.O = 2048 # Original dimension of the input embeddings
self.Od = 2048 # Original dimension of the input embeddings
self.M = 128 # New dimension of the input embedding

self.L = 128 # Dimension of the new features after query and value projections
self.K = 1000 # Number of elements in each bag

self.feature_extractor_part1 = nn.Sequential(
nn.Linear(self.O, self.M),
)
self.feature_extractor_part1 = nn.Sequential(nn.Linear(self.Od, self.M))
# The Gated Attention using tanh and sigmoid from Eq 9
# from https://arxiv.org/abs/1802.04712

10 changes: 4 additions & 6 deletions flamby/datasets/fed_dummy_dataset.py
@@ -24,8 +24,9 @@ def __len__(self):
return self.size

def __getitem__(self, idx):
return torch.rand(3, 224, 224).to(self.X_dtype), torch.randint(0, 2, (1,)).to(
self.y_dtype
return (
torch.rand(3, 224, 224).to(self.X_dtype),
torch.randint(0, 2, (1,)).to(self.y_dtype),
)


@@ -53,10 +54,7 @@ def forward(self, X):
m = Baseline()
lo = BaselineLoss()
dl = DataLoader(
FedDummyDataset(center=1, train=True),
batch_size=32,
shuffle=True,
num_workers=0,
FedDummyDataset(center=1, train=True), batch_size=32, shuffle=True, num_workers=0
)
it = iter(dl)
X, y = next(it)
14 changes: 3 additions & 11 deletions flamby/datasets/fed_heart_disease/dataset.py
@@ -73,12 +73,7 @@ def __init__(
self.y_dtype = y_dtype
self.debug = debug

self.centers_number = {
"cleveland": 0,
"hungarian": 1,
"switzerland": 2,
"va": 3,
}
self.centers_number = {"cleveland": 0, "hungarian": 1, "switzerland": 2, "va": 3}

self.features = pd.DataFrame()
self.labels = pd.DataFrame()
@@ -165,9 +160,7 @@ def __init__(
}

# We finally broadcast the means and stds over all datasets
self.mean_of_features = torch.zeros(
(len(self.features), 13), dtype=self.X_dtype
)
self.mean_of_features = torch.zeros((len(self.features), 13), dtype=self.X_dtype)
self.std_of_features = torch.ones((len(self.features), 13), dtype=self.X_dtype)
for i in range(self.mean_of_features.shape[0]):
self.mean_of_features[i] = self.centers_stats[self.centers[i]]["mean"]
@@ -177,8 +170,7 @@ def __init__(
to_select = [(self.sets[idx] == "train") for idx, _ in enumerate(self.features)]
features_train = [fp for idx, fp in enumerate(self.features) if to_select[idx]]
features_tensor_train = torch.cat(
[features_train[i][None, :] for i in range(len(features_train))],
axis=0,
[features_train[i][None, :] for i in range(len(features_train))], axis=0
)
self.mean_of_features_pooled_train = features_tensor_train.mean(axis=0)
self.std_of_features_pooled_train = features_tensor_train.std(axis=0)
(diff for another file; file path not shown)
@@ -20,12 +20,9 @@ def main(output_folder, debug=False):

# location of the files in the UCI archive
accept_license(
"https://archive-beta.ics.uci.edu/ml/datasets/heart+disease",
"fed_heart_disease",
)
base_url = (
"https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/"
"https://archive-beta.ics.uci.edu/ml/datasets/heart+disease", "fed_heart_disease"
)
base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/"
centers = ["cleveland", "hungarian", "switzerland", "va"]
md5_hashes = [
"2d91a8ff69cfd9616aa47b59d6f843db",
@@ -69,9 +66,7 @@
sys.exit()

# get status of download
downloaded_status_file_path = os.path.join(
output_folder, "download_status_file.csv"
)
downloaded_status_file_path = os.path.join(output_folder, "download_status_file.csv")
if not (os.path.exists(downloaded_status_file_path)):
downloaded_status_file = pd.DataFrame()
downloaded_status_file["Status"] = ["Not found"] * 4
1 change: 0 additions & 1 deletion flamby/datasets/fed_heart_disease/metric.py
@@ -1,5 +1,4 @@
import numpy as np
from sklearn.metrics import roc_auc_score


def metric(y_true, y_pred):
24 changes: 4 additions & 20 deletions flamby/datasets/fed_isic2019/benchmark.py
@@ -22,14 +22,7 @@


def train_model(
model,
optimizer,
scheduler,
dataloaders,
dataset_sizes,
device,
lossfunc,
num_epochs,
model, optimizer, scheduler, dataloaders, dataset_sizes, device, lossfunc, num_epochs
):
"""Training function
Parameters
@@ -224,16 +217,10 @@ def main(args):

parser = argparse.ArgumentParser()
parser.add_argument(
"--GPU",
type=int,
default=0,
help="GPU to run the training on (if available)",
"--GPU", type=int, default=0, help="GPU to run the training on (if available)"
)
parser.add_argument(
"--workers",
type=int,
default=4,
help="Numbers of workers for the dataloader",
"--workers", type=int, default=4, help="Numbers of workers for the dataloader"
)
args = parser.parse_args()

@@ -243,10 +230,7 @@

sz = 200
test_aug = albumentations.Compose(
[
albumentations.CenterCrop(sz, sz),
albumentations.Normalize(always_apply=True),
]
[albumentations.CenterCrop(sz, sz), albumentations.Normalize(always_apply=True)]
)
test_dataset = dataset.FedIsic2019(train=False, pooled=True)
test_dataloader = torch.utils.data.DataLoader(
(diff for another file; file path not shown)
@@ -76,23 +76,17 @@
for i, row in ISIC_2019_Training_Metadata.iterrows():
if pd.isnull(row["lesion_id"]):
image = row["image"]
os.system(
"rm " + data_directory + "/ISIC_2019_Training_Input/" + image + ".jpg"
)
os.system("rm " + data_directory + "/ISIC_2019_Training_Input/" + image + ".jpg")
if image != ISIC_2019_Training_GroundTruth["image"][i]:
print("Mismatch between Metadata and Ground Truth")
ISIC_2019_Training_GroundTruth = ISIC_2019_Training_GroundTruth.drop(i)
ISIC_2019_Training_Metadata = ISIC_2019_Training_Metadata.drop(i)

# generating dataset field from lesion_id field in the metadata dataframe
ISIC_2019_Training_Metadata["dataset"] = ISIC_2019_Training_Metadata["lesion_id"].str[
:4
]
ISIC_2019_Training_Metadata["dataset"] = ISIC_2019_Training_Metadata["lesion_id"].str[:4]

# join with HAM10000 metadata in order to expand the HAM datacenters
result = pd.merge(
ISIC_2019_Training_Metadata, HAM10000_metadata, how="left", on="image"
)
result = pd.merge(ISIC_2019_Training_Metadata, HAM10000_metadata, how="left", on="image")
result["dataset"] = result["dataset_x"] + result["dataset_y"].astype(str)
result.drop(["dataset_x", "dataset_y", "lesion_id"], axis=1, inplace=True)

15 changes: 3 additions & 12 deletions flamby/datasets/fed_isic2019/heterogeneity_pic.py
@@ -42,22 +42,13 @@ def forward(self, image):

parser = argparse.ArgumentParser()
parser.add_argument(
"--GPU",
type=int,
default=0,
help="GPU to run the training on (if available)",
"--GPU", type=int, default=0, help="GPU to run the training on (if available)"
)
parser.add_argument(
"--workers",
type=int,
default=0,
help="Numbers of workers for the dataloader",
"--workers", type=int, default=0, help="Numbers of workers for the dataloader"
)
parser.add_argument(
"--seed",
type=int,
default=42,
help="The seed for the UMPA and dataloading",
"--seed", type=int, default=42, help="The seed for the UMPA and dataloading"
)
args = parser.parse_args()
np.random.seed(args.seed)
14 changes: 10 additions & 4 deletions flamby/datasets/fed_ixi/dataset_creation_scripts/download.py
@@ -21,10 +21,16 @@ def dl_ixi_tiny(output_folder, debug=False):
The folder where to download the dataset.
"""
print(
"The IXI dataset is made available under the Creative Commons CC BY-SA 3.0 license.\n\
If you use the IXI data please acknowledge the source of the IXI data, e.g. the following website: https://brain-development.org/ixi-dataset/\n\
IXI Tiny is derived from the same source. Acknowledge the following reference on TorchIO : https://torchio.readthedocs.io/datasets.html#ixitiny\n\
Pérez-García F, Sparks R, Ourselin S. TorchIO: a Python library for efficient loading, preprocessing, augmentation and patch-based sampling of medical images in deep learning. arXiv:2003.04696 [cs, eess, stat]. 2020. https://doi.org/10.48550/arXiv.2003.04696"
"The IXI dataset is made available under the Creative Commons CC BY-SA \
3.0 license.\n\
If you use the IXI data please acknowledge the source of the IXI data, e.g.\
the following website: https://brain-development.org/ixi-dataset/\
IXI Tiny is derived from the same source. Acknowledge the following reference\
on TorchIO : https://torchio.readthedocs.io/datasets.html#ixitiny\
Pérez-García F, Sparks R, Ourselin S. TorchIO: a Python library for \
efficient loading, preprocessing, augmentation and patch-based sampling \
of medical images in deep learning. arXiv:2003.04696 [cs, eess, stat]. \
2020. https://doi.org/10.48550/arXiv.2003.04696"
)
accept_license("https://brain-development.org/ixi-dataset/", "fed_ixi")
os.makedirs(output_folder, exist_ok=True)
(Diffs for the remaining changed files are not shown.)
