From 73ca7ab567b300d883af94fc35f444bc70bc60b7 Mon Sep 17 00:00:00 2001
From: Ali Shibli
Date: Tue, 24 Sep 2024 15:02:07 +0200
Subject: [PATCH 01/17] add stratified sampling to training set

---
 environment.yaml |  3 ++-
 run.py           | 26 ++++++++++++++++++++++----
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/environment.yaml b/environment.yaml
index 1c486cf8..a3d6e4d3 100644
--- a/environment.yaml
+++ b/environment.yaml
@@ -28,4 +28,5 @@ dependencies:
   - google-cloud-storage
   - omegaconf
   - pydataverse
-  - pytest
\ No newline at end of file
+  - pytest
+  - pip install pyDataverse
\ No newline at end of file
diff --git a/run.py b/run.py
index 4f20c1da..682caec3 100644
--- a/run.py
+++ b/run.py
@@ -10,6 +10,7 @@
 import torch
 from torch.utils.data import DataLoader, Subset
 from torch.utils.data.distributed import DistributedSampler
+from sklearn.model_selection import StratifiedShuffleSplit
 
 import foundation_models
 import datasets
@@ -86,6 +87,8 @@
                     help="Percentage of the dataset to use as a decimal, \
                     (e.g., 0.1 for 10%). Default -1 to use the entire dataset.")
 
+parser.add_argument("--stratified_sampling", action="store_true", help="use stratified sampling for dataset splitting")
+
 parser.add_argument("--seed", type=int,
                     help="random seed")
 parser.add_argument("--num_workers", type=int,
@@ -246,13 +249,28 @@ def main():
             )
         )
     collate_fn = get_collate_fn(cfg)
+
     # training
     if not cfg.eval_dir:
 
         if 0 < cfg.limited_label < 1:
-            indices = random.sample(range(len(train_dataset)), int(len(train_dataset)*cfg.limited_label))
-            train_dataset = Subset(train_dataset, indices)
-            perc = cfg.limited_label*100
-            logger.info(f"Created a subset of the train dataset, with {perc}% of the labels available")
+            if cfg.stratified_sampling:
+                # Use stratified sampling with the limited label
+                stratified_split = StratifiedShuffleSplit(n_splits=1, train_size=cfg.limited_label, random_state=cfg.seed)
+
+                labels = train_dataset.targets  # Adjust depending on how labels are stored in your dataset
+                for train_idx, _ in stratified_split.split(torch.zeros(len(labels)), labels):
+                    train_dataset = Subset(train_dataset, train_idx)
+
+                perc = cfg.limited_label * 100
+                logger.info(f"Created a stratified subset of the train dataset, with {perc}% of the labels available.")
+
+            else:
+                # Randomly sample a percentage of the dataset
+                indices = random.sample(range(len(train_dataset)), int(len(train_dataset) * cfg.limited_label))
+                train_dataset = Subset(train_dataset, indices)
+
+                perc = cfg.limited_label * 100
+                logger.info(f"Created a random subset of the train dataset, with {perc}% of the labels available.")
         else:
            logger.info(f"The entire train dataset will be used.")
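
For reference, the sampling logic this patch wires into `run.py` can be exercised standalone. A minimal sketch with a synthetic dataset (sizes and names here are illustrative, not taken from the repository):

```python
import torch
from torch.utils.data import Subset, TensorDataset
from sklearn.model_selection import StratifiedShuffleSplit

# Toy stand-in for train_dataset: 100 samples with binary image-level labels
labels = torch.randint(0, 2, (100,))
dataset = TensorDataset(torch.randn(100, 3, 8, 8), labels)

# Keep 10% of the samples while preserving label proportions,
# mirroring the cfg.stratified_sampling branch in the patch above
splitter = StratifiedShuffleSplit(n_splits=1, train_size=0.1, random_state=42)
train_idx, _ = next(splitter.split(torch.zeros(len(labels)), labels))
subset = Subset(dataset, train_idx)
print(len(subset))  # 10
```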
""" + burned_ratios = [] + + # Step 1: Calculate the burned ratio for each image + for mask_path in self.target_list: + with rasterio.open(mask_path) as src: + mask = src.read(1) + total_pixels = mask.size + burned_pixels = np.sum(mask == 1) + burned_ratio = burned_pixels / total_pixels + burned_ratios.append(burned_ratio) + + # Step 2: Calculate the median burned ratio + median_burned_ratio = np.median(burned_ratios) + + # Step 3: Generate 0/1 labels based on whether burned ratio is above or below the median + labels = [(1 if ratio > median_burned_ratio else 0) for ratio in burned_ratios] + + return labels + def __len__(self): return len(self.image_list) diff --git a/run.py b/run.py index 682caec3..d3dd5eb0 100644 --- a/run.py +++ b/run.py @@ -257,7 +257,8 @@ def main(): # Use stratified sampling with the limited label stratified_split = StratifiedShuffleSplit(n_splits=1, train_size=cfg.limited_label, random_state=cfg.seed) - labels = train_dataset.targets # Adjust depending on how labels are stored in your dataset + labels = train_dataset.labels # Adjust depending on how labels are stored in your dataset + for train_idx, _ in stratified_split.split(torch.zeros(len(labels)), labels): train_dataset = Subset(train_dataset, train_idx) From 90e8c3ca8c46339332d54131aa27b229e7668092 Mon Sep 17 00:00:00 2001 From: Ali Shibli Date: Wed, 2 Oct 2024 19:41:10 +0200 Subject: [PATCH 03/17] add geofmsubset class --- pangaea/datasets/base.py | 31 ++++++++++++++++++++++++++++++- pangaea/datasets/hlsburnscars.py | 18 +----------------- pangaea/run.py | 16 ++++++++-------- 3 files changed, 39 insertions(+), 26 deletions(-) diff --git a/pangaea/datasets/base.py b/pangaea/datasets/base.py index b1fda6d1..0470d143 100644 --- a/pangaea/datasets/base.py +++ b/pangaea/datasets/base.py @@ -1,5 +1,5 @@ import torch -from torch.utils.data import Dataset +from torch.utils.data import Dataset, Subset import os class GeoFMDataset(Dataset): @@ -115,3 +115,32 @@ def download(self) -> None: NotImplementedError: raise if the method is not implemented """ raise NotImplementedError + + +class GeoFMSubset(Subset): + """Custom subset class that retains dataset attributes.""" + + def __init__(self, dataset, indices): + super().__init__(dataset, indices) + + # Copy relevant attributes from the original dataset + self.dataset_name = getattr(dataset, 'dataset_name', None) + self.root_path = getattr(dataset, 'root_path', None) + self.auto_download = getattr(dataset, 'auto_download', None) + self.img_size = getattr(dataset, 'img_size', None) + self.multi_temporal = getattr(dataset, 'multi_temporal', None) + self.multi_modal = getattr(dataset, 'multi_modal', None) + self.ignore_index = getattr(dataset, 'ignore_index', None) + self.num_classes = getattr(dataset, 'num_classes', None) + self.classes = getattr(dataset, 'classes', None) + self.distribution = getattr(dataset, 'distribution', None) + self.bands = getattr(dataset, 'bands', None) + self.data_mean = getattr(dataset, 'data_mean', None) + self.data_std = getattr(dataset, 'data_std', None) + self.data_min = getattr(dataset, 'data_min', None) + self.data_max = getattr(dataset, 'data_max', None) + + def filter_by_indices(self, indices): + """Apply filtering by indices directly in this subset.""" + return GeoFMSubset(self.dataset, indices) + diff --git a/pangaea/datasets/hlsburnscars.py b/pangaea/datasets/hlsburnscars.py index c2a2e0b9..6a5fee97 100644 --- a/pangaea/datasets/hlsburnscars.py +++ b/pangaea/datasets/hlsburnscars.py @@ -143,7 +143,6 @@ def __len__(self): return 

From 90e8c3ca8c46339332d54131aa27b229e7668092 Mon Sep 17 00:00:00 2001
From: Ali Shibli
Date: Wed, 2 Oct 2024 19:41:10 +0200
Subject: [PATCH 03/17] add geofmsubset class

---
 pangaea/datasets/base.py         | 31 ++++++++++++++++++++++++++++++-
 pangaea/datasets/hlsburnscars.py | 18 +-----------------
 pangaea/run.py                   | 16 ++++++++--------
 3 files changed, 39 insertions(+), 26 deletions(-)

diff --git a/pangaea/datasets/base.py b/pangaea/datasets/base.py
index b1fda6d1..0470d143 100644
--- a/pangaea/datasets/base.py
+++ b/pangaea/datasets/base.py
@@ -1,5 +1,5 @@
 import torch
-from torch.utils.data import Dataset
+from torch.utils.data import Dataset, Subset
 import os
 
 class GeoFMDataset(Dataset):
@@ -115,3 +115,32 @@ def download(self) -> None:
             NotImplementedError: raise if the method is not implemented
         """
         raise NotImplementedError
+
+
+class GeoFMSubset(Subset):
+    """Custom subset class that retains dataset attributes."""
+
+    def __init__(self, dataset, indices):
+        super().__init__(dataset, indices)
+
+        # Copy relevant attributes from the original dataset
+        self.dataset_name = getattr(dataset, 'dataset_name', None)
+        self.root_path = getattr(dataset, 'root_path', None)
+        self.auto_download = getattr(dataset, 'auto_download', None)
+        self.img_size = getattr(dataset, 'img_size', None)
+        self.multi_temporal = getattr(dataset, 'multi_temporal', None)
+        self.multi_modal = getattr(dataset, 'multi_modal', None)
+        self.ignore_index = getattr(dataset, 'ignore_index', None)
+        self.num_classes = getattr(dataset, 'num_classes', None)
+        self.classes = getattr(dataset, 'classes', None)
+        self.distribution = getattr(dataset, 'distribution', None)
+        self.bands = getattr(dataset, 'bands', None)
+        self.data_mean = getattr(dataset, 'data_mean', None)
+        self.data_std = getattr(dataset, 'data_std', None)
+        self.data_min = getattr(dataset, 'data_min', None)
+        self.data_max = getattr(dataset, 'data_max', None)
+
+    def filter_by_indices(self, indices):
+        """Apply filtering by indices directly in this subset."""
+        return GeoFMSubset(self.dataset, indices)
+
diff --git a/pangaea/datasets/hlsburnscars.py b/pangaea/datasets/hlsburnscars.py
index c2a2e0b9..6a5fee97 100644
--- a/pangaea/datasets/hlsburnscars.py
+++ b/pangaea/datasets/hlsburnscars.py
@@ -143,7 +143,6 @@ def __len__(self):
         return len(self.image_list)
 
     def __getitem__(self, index):
-
         image = tiff.imread(self.image_list[index])
         image = image.astype(np.float32)  # Convert to float32
         image = torch.from_numpy(image).permute(2, 0, 1)
@@ -155,7 +154,6 @@ def __getitem__(self, index):
         invalid_mask = image == 9999
         image[invalid_mask] = 0
-
         output = {
             'image': {
                 'optical': image,
             },
             'target': target,
             'metadata': {}
         }
-
-        return output
-
-    @staticmethod
-    def get_stratified_train_val_split(all_files) -> Tuple[Sequence[int], Sequence[int]]:
+        return output
 
-        # Fixed stratified sample to split data into train/val.
-        # This keeps 90% of datapoints belonging to an individual event in the training set and puts the remaining 10% in the validation set.
-        disaster_names = list(
-            map(lambda path: pathlib.Path(path).name.split("_")[0], all_files))
-        train_idxs, val_idxs = train_test_split(np.arange(len(all_files)),
-                                                test_size=0.1,
-                                                random_state=23,
-                                                stratify=disaster_names)
-        return {"train": train_idxs, "val": val_idxs}
-
     @staticmethod
     def download(self, silent=False):
         output_path = pathlib.Path(self.root_path)
diff --git a/pangaea/run.py b/pangaea/run.py
index 9312ff53..326b6f0e 100644
--- a/pangaea/run.py
+++ b/pangaea/run.py
@@ -26,7 +26,7 @@
     seed_worker,
 )
 from pangaea.utils.stratification import stratify_single_dataset_indices
-
+from pangaea.datasets.base import GeoFMSubset
 
 def get_exp_name(hydra_config: HydraConf) -> str:
     """Create a unique experiment name based on the choices made in the config.
@@ -154,15 +154,15 @@ def main(cfg: DictConfig) -> None:
             preprocess, dataset=val_dataset, encoder=encoder
         )
     if 0 < cfg.limited_label < 1:
-        n_train_samples = len(train_dataset)
-        indices = random.sample(
-            range(n_train_samples), int(n_train_samples * cfg.limited_label)
-        )
-        # labeled_train_idx, _ = stratify_single_dataset_indices(train_dataset, num_classes=2, label_fraction=0.5, num_bins=3)
-        # indices = labeled_train_idx
-        train_dataset = Subset(train_dataset, indices)
+        # n_train_samples = len(train_dataset)
+        # indices = random.sample(
+        #     range(n_train_samples), int(n_train_samples * cfg.limited_label)
+        # )
+        indices, _ = stratify_single_dataset_indices(train_dataset, num_classes=cfg.dataset.num_classes, label_fraction=cfg.limited_label, num_bins=3)
+        train_dataset = GeoFMSubset(train_dataset, indices)
         logger.info(
             f"Created a subset of the train dataset, with {cfg.limited_label * 100}% of the labels available"
+            f"Total number of patches used: {len(train_dataset)}"
         )
     else:
         logger.info("The entire train dataset will be used.")
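
The reason `GeoFMSubset` copies metadata field by field is that `torch.utils.data.Subset` exposes none of the wrapped dataset's attributes. A hedged alternative sketch (not what the patch does) is to delegate lookups instead of copying:

```python
from torch.utils.data import Subset

class DelegatingSubset(Subset):
    """Subset that falls back to the wrapped dataset for unknown attributes."""

    def __getattr__(self, name):
        # Only invoked when normal attribute lookup fails,
        # so Subset's own fields (dataset, indices) take precedence.
        return getattr(self.dataset, name)
```

The explicit `getattr(..., None)` copies in the patch trade this generality for a visible, greppable list of the fields downstream code may rely on.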

From b3a5a1a93d1dd5862da4ecfe33ef1999c6c64176 Mon Sep 17 00:00:00 2001
From: Ali Shibli
Date: Wed, 2 Oct 2024 20:16:22 +0200
Subject: [PATCH 04/17] add val stratification and logging info

---
 pangaea/datasets/base.py        |  1 +
 pangaea/run.py                  | 14 +++++++++++---
 pangaea/utils/stratification.py | 16 +++++++++-------
 3 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/pangaea/datasets/base.py b/pangaea/datasets/base.py
index 0470d143..3cd3388e 100644
--- a/pangaea/datasets/base.py
+++ b/pangaea/datasets/base.py
@@ -139,6 +139,7 @@ def __init__(self, dataset, indices):
         self.data_std = getattr(dataset, 'data_std', None)
         self.data_min = getattr(dataset, 'data_min', None)
         self.data_max = getattr(dataset, 'data_max', None)
+        self.split = getattr(dataset, 'split', None)
 
     def filter_by_indices(self, indices):
         """Apply filtering by indices directly in this subset."""
diff --git a/pangaea/run.py b/pangaea/run.py
index 326b6f0e..855f87c6 100644
--- a/pangaea/run.py
+++ b/pangaea/run.py
@@ -158,11 +158,19 @@ def main(cfg: DictConfig) -> None:
         # indices = random.sample(
         #     range(n_train_samples), int(n_train_samples * cfg.limited_label)
         # )
-        indices, _ = stratify_single_dataset_indices(train_dataset, num_classes=cfg.dataset.num_classes, label_fraction=cfg.limited_label, num_bins=3)
+
+        # Stratify train dataset
+        indices, _ = stratify_single_dataset_indices(train_dataset, num_classes=cfg.dataset.num_classes, label_fraction=cfg.limited_label, num_bins=3, logger=logger)
         train_dataset = GeoFMSubset(train_dataset, indices)
+
+        # Stratify validation dataset
+        indices, _ = stratify_single_dataset_indices(val_dataset, num_classes=cfg.dataset.num_classes, label_fraction=cfg.limited_label, num_bins=3, logger=logger)
+        val_dataset = GeoFMSubset(val_dataset, indices)
+
         logger.info(
-            f"Created a subset of the train dataset, with {cfg.limited_label * 100}% of the labels available"
-            f"Total number of patches used: {len(train_dataset)}"
+            f"Created a subset of the train and val dataset, with {cfg.limited_label * 100}% of the labels available\n"
+            f"Total number of train patches: {len(train_dataset)}\n"
+            f"Total number of validation patches: {len(val_dataset)}\n"
         )
     else:
         logger.info("The entire train dataset will be used.")
diff --git a/pangaea/utils/stratification.py b/pangaea/utils/stratification.py
index 3a41722b..b88146d9 100644
--- a/pangaea/utils/stratification.py
+++ b/pangaea/utils/stratification.py
@@ -17,28 +17,30 @@ def calculate_class_distributions(dataset, num_classes):
     return np.array(class_distributions)
 
 # Function to bin class distributions with a progress bar
-def bin_class_distributions(class_distributions, num_bins=3):
-    print("Binning class distributions...")
+def bin_class_distributions(class_distributions, num_bins=3, logger=None):
+
+    logger.info("Binning class distributions...")
     # Adding a progress bar for binning class distributions
     binned_distributions = np.digitize(class_distributions, np.linspace(0, 1, num_bins+1)) - 1
     return binned_distributions
 
 # Function to perform stratification and return only the indices
-def stratify_single_dataset_indices(dataset, num_classes, label_fraction=1.0, num_bins=3):
-    print("Starting stratification...")
+def stratify_single_dataset_indices(dataset, num_classes, label_fraction=1.0, num_bins=3, logger=None):
+
+    logger.info("Starting stratification...")
     # Step 1: Calculate class distributions with progress tracking
     class_distributions = calculate_class_distributions(dataset, num_classes)
 
     # Step 2: Bin the class distributions
-    binned_distributions = bin_class_distributions(class_distributions, num_bins=num_bins)
+    binned_distributions = bin_class_distributions(class_distributions, num_bins=num_bins, logger=logger)
 
     # Step 3: Combine the bins to use for stratification
     combined_bins = np.apply_along_axis(lambda row: ''.join(map(str, row)), axis=1, arr=binned_distributions)
 
     # Step 4: Select a subset of labeled data with progress tracking
     num_labeled = int(len(dataset) * label_fraction)
-    print(f"Selecting {label_fraction * 100:.0f}% labeled data from {len(dataset)} samples...")
+    logger.info(f"Selecting {label_fraction * 100:.0f}% labeled data from {len(dataset)} samples...")
 
     # Shuffle and take the labeled part of the dataset based on the binned distributions
     indices = np.arange(len(dataset))
@@ -49,5 +51,5 @@ def stratify_single_dataset_indices(dataset, num_classes, label_fraction=1.0, nu
     labeled_idx = sorted_indices[:num_labeled]
     unlabeled_idx = sorted_indices[num_labeled:]
 
-    print("Stratification complete.")
+    logger.info("Stratification complete.")
     return labeled_idx, unlabeled_idx

From f709e357c871b765009595e782b8e4dc85a4ded8 Mon Sep 17 00:00:00 2001
From: Ali Shibli
Date: Wed, 2 Oct 2024 20:46:48 +0200
Subject: [PATCH 05/17] update readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9c2da1b6..ca1d74bd 100644
--- a/README.md
+++ b/README.md
@@ -101,7 +101,7 @@ We provide several ways to install the dependencies.
 
 ## 🏋️ Training
 
-To run experiments, please refer to `configs/train.yaml`. In it, in addition to some basic info about training (e.g. `finetune` for fine-tuning also the encoder, `limited_label` to train the model on a subset of labels, `num_workers`, `batch_size` and so on), there are 5 different basic configs:
+To run experiments, please refer to `configs/train.yaml`. In it, in addition to some basic info about training (e.g. `finetune` for fine-tuning also the encoder, `limited_label` to train the model on a stratified subset of labels, `num_workers`, `batch_size` and so on), there are 5 different basic configs:
 - `dataset`: Information of downstream datasets such as image size, band_statistics, classes etc.
 - `decoder`: Downstream task decoder fine-tuning related parameters, like the type of architecture (e.g. UPerNet), which multi-temporal strategy to use, and other related hparams (e.g. nr of channels)
 - `encoder`: GFM encoder related parameters. `output_layers` is used for which layers are used for Upernet decoder.

From f396083f2593804722bd15deaf2b07908948aa93 Mon Sep 17 00:00:00 2001
From: yurujaja
Date: Mon, 7 Oct 2024 14:08:37 +0200
Subject: [PATCH 06/17] re-add hlsburn train-val split

---
 pangaea/datasets/base.py         |  1 +
 pangaea/datasets/hlsburnscars.py | 24 ++++++++++++++++--------
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/pangaea/datasets/base.py b/pangaea/datasets/base.py
index 3cd3388e..f611b5c1 100644
--- a/pangaea/datasets/base.py
+++ b/pangaea/datasets/base.py
@@ -127,6 +127,7 @@ def __init__(self, dataset, indices):
         self.dataset_name = getattr(dataset, 'dataset_name', None)
         self.root_path = getattr(dataset, 'root_path', None)
         self.auto_download = getattr(dataset, 'auto_download', None)
+        self.download_url = getattr(dataset, 'download_url', None)
         self.img_size = getattr(dataset, 'img_size', None)
         self.multi_temporal = getattr(dataset, 'multi_temporal', None)
         self.multi_modal = getattr(dataset, 'multi_modal', None)
diff --git a/pangaea/datasets/hlsburnscars.py b/pangaea/datasets/hlsburnscars.py
index 6a5fee97..8774448d 100644
--- a/pangaea/datasets/hlsburnscars.py
+++ b/pangaea/datasets/hlsburnscars.py
@@ -2,21 +2,17 @@
 import time
 import torch
 import numpy as np
-# import rasterio
 import tifffile as tiff
-from typing import Sequence, Dict, Any, Union, Literal, Tuple
+from typing import Sequence, Tuple
 from sklearn.model_selection import train_test_split
 from glob import glob
 
 import torch
-import torchvision.transforms.functional as TF
-import torchvision.transforms as T
 
 import pathlib
 import urllib
 import tarfile
 
-# from utils.registry import DATASET_REGISTRY
 from pangaea.datasets.utils import DownloadProgressBar
 from pangaea.datasets.base import GeoFMDataset
@@ -164,6 +160,20 @@ def __getitem__(self, index):
 
         return output
 
+
+    @staticmethod
+    def get_stratified_train_val_split(all_files) -> Tuple[Sequence[int], Sequence[int]]:
+
+        # Fixed stratified sample to split data into train/val.
+        # This keeps 90% of datapoints belonging to an individual event in the training set and puts the remaining 10% in the validation set.
+        disaster_names = list(
+            map(lambda path: pathlib.Path(path).name.split("_")[0], all_files))
+        train_idxs, val_idxs = train_test_split(np.arange(len(all_files)),
+                                                test_size=0.1,
+                                                random_state=23,
+                                                stratify=disaster_names)
+        return {"train": train_idxs, "val": val_idxs}
+
     @staticmethod
     def download(self, silent=False):
         output_path = pathlib.Path(self.root_path)
@@ -195,6 +205,4 @@ def download(self, silent=False):
                     tar.extractall(output_path)
                 print("done.")
 
-        os.remove(output_path / temp_file_name)
-
-
+        os.remove(output_path / temp_file_name)
\ No newline at end of file
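
The re-added helper stratifies the 90/10 split on the event name encoded at the front of each file name. A self-contained sketch of that mechanism with hypothetical file names:

```python
import numpy as np
from sklearn.model_selection import train_test_split

all_files = [f"fire{i % 4}_patch{i}.tif" for i in range(40)]  # hypothetical names
events = [name.split("_")[0] for name in all_files]

train_idx, val_idx = train_test_split(
    np.arange(len(all_files)), test_size=0.1, random_state=23, stratify=events
)
print(len(train_idx), len(val_idx))  # 36 4, one validation patch per event
```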

From 8080464ace3ad18858f35743ff9079cede5dc490 Mon Sep 17 00:00:00 2001
From: yurujaja
Date: Mon, 7 Oct 2024 17:07:07 +0200
Subject: [PATCH 07/17] limited label for both train and val, random or
 stratified sampling

---
 README.md                                    |  8 ++--
 configs/train.yaml                           |  6 ++-
 pangaea/datasets/hlsburnscars.py             | 15 +------
 pangaea/run.py                               | 40 ++++++++-----------
 .../{stratification.py => subset_sampler.py} | 39 +++++++++++-------
 5 files changed, 53 insertions(+), 55 deletions(-)
 rename pangaea/utils/{stratification.py => subset_sampler.py} (68%)

diff --git a/README.md b/README.md
index ca1d74bd..7bd90efc 100644
--- a/README.md
+++ b/README.md
@@ -101,7 +101,7 @@ We provide several ways to install the dependencies.
 
 ## 🏋️ Training
 
-To run experiments, please refer to `configs/train.yaml`. In it, in addition to some basic info about training (e.g. `finetune` for fine-tuning also the encoder, `limited_label` to train the model on a stratified subset of labels, `num_workers`, `batch_size` and so on), there are 5 different basic configs:
+To run experiments, please refer to `configs/train.yaml`. In it, in addition to some basic info about training (e.g. `finetune` for fine-tuning also the encoder, `limited_label_train` to train the model on a stratified subset of labels, `num_workers`, `batch_size` and so on), there are 5 different basic configs:
 - `dataset`: Information of downstream datasets such as image size, band_statistics, classes etc.
 - `decoder`: Downstream task decoder fine-tuning related parameters, like the type of architecture (e.g. UPerNet), which multi-temporal strategy to use, and other related hparams (e.g. nr of channels)
 - `encoder`: GFM encoder related parameters. `output_layers` is used for which layers are used for Upernet decoder.
@@ -136,7 +136,7 @@ torchrun --nnodes=1 --nproc_per_node=1 pangaea/run.py \
     task=segmentation
 ```
 
-If you want to overwrite some parameters (e.g. turn off wandbe, and changing batch size and the path to the dataset):
+If you want to overwrite some parameters (e.g. turn off wandbe, change batch size and the path to the dataset, and use 50% stratified sampled subset for training):
 ```
 torchrun --nnodes=1 --nproc_per_node=1 pangaea/run.py \
     --config-name=train \
@@ -148,7 +148,9 @@ torchrun --nnodes=1 --nproc_per_node=1 pangaea/run.py \
     task=segmentation \
     dataset.root_path= /path/to/the/dataset/hlsburnscars \
     batch_size=16 \
-    use_wandb=False
+    use_wandb=False \
+    limited_label_train=0.5 \
+    limited_label_strategy=stratified
 ```
 
 #### Multi-Temporal Semantic Segmentation
diff --git a/configs/train.yaml b/configs/train.yaml
index 5e97e1ae..1c9e7ade 100644
--- a/configs/train.yaml
+++ b/configs/train.yaml
@@ -12,7 +12,11 @@ batch_size: 32
 # EXPERIMENT
 finetune: false
 ckpt_dir: null
-limited_label: 1
+limited_label_train: 1
+limited_label_val: 1
+limited_label_strategy: stratified # stratified, random
+stratification_bins: 3 # number of bins for stratified sampling, only for stratified
+
 
 defaults:
   - task: ???
diff --git a/pangaea/datasets/hlsburnscars.py b/pangaea/datasets/hlsburnscars.py
index 8774448d..0678660e 100644
--- a/pangaea/datasets/hlsburnscars.py
+++ b/pangaea/datasets/hlsburnscars.py
@@ -16,7 +16,7 @@
 from pangaea.datasets.utils import DownloadProgressBar
 from pangaea.datasets.base import GeoFMDataset
 
-# @DATASET_REGISTRY.register()
+
 class HLSBurnScars(GeoFMDataset):
     def __init__(
         self,
@@ -161,19 +157,6 @@ def __getitem__(self, index):
 
         return output
 
-    @staticmethod
-    def get_stratified_train_val_split(all_files) -> Tuple[Sequence[int], Sequence[int]]:
-
-        # Fixed stratified sample to split data into train/val.
-        # This keeps 90% of datapoints belonging to an individual event in the training set and puts the remaining 10% in the validation set.
-        disaster_names = list(
-            map(lambda path: pathlib.Path(path).name.split("_")[0], all_files))
-        train_idxs, val_idxs = train_test_split(np.arange(len(all_files)),
-                                                test_size=0.1,
-                                                random_state=23,
-                                                stratify=disaster_names)
-        return {"train": train_idxs, "val": val_idxs}
-
     @staticmethod
     def download(self, silent=False):
         output_path = pathlib.Path(self.root_path)
diff --git a/pangaea/run.py b/pangaea/run.py
index 855f87c6..213b6d49 100644
--- a/pangaea/run.py
+++ b/pangaea/run.py
@@ -10,7 +10,7 @@
 from hydra.core.hydra_config import HydraConfig
 from hydra.utils import instantiate
 from omegaconf import DictConfig, OmegaConf
-from torch.utils.data import DataLoader, Dataset, Subset
+from torch.utils.data import DataLoader, Dataset
 from torch.utils.data.distributed import DistributedSampler
 
 from pangaea.decoders.base import Decoder
@@ -25,9 +25,10 @@
     get_generator,
     seed_worker,
 )
-from pangaea.utils.stratification import stratify_single_dataset_indices
+from pangaea.utils.subset_sampler import get_subset_indices
 from pangaea.datasets.base import GeoFMSubset
 
+
 def get_exp_name(hydra_config: HydraConf) -> str:
     """Create a unique experiment name based on the choices made in the config.
@@ -140,11 +141,6 @@ def main(cfg: DictConfig) -> None:
 
     # training
     if train_run:
-        # Step 1: Stratify the dataset to get indices
-        # labeled_train_idx, _ = stratify_single_dataset_indices(train_dataset, num_classes=2, label_fraction=0.5, num_bins=3)
-        # Step 2: Use Subset to filter the dataset with stratified indices
-        # train_dataset = Subset(train_dataset, labeled_train_idx)
-
         for preprocess in cfg.preprocessing.train:
             train_dataset: Dataset = instantiate(
                 preprocess, dataset=train_dataset, encoder=encoder
@@ -153,27 +149,25 @@ def main(cfg: DictConfig) -> None:
             val_dataset: Dataset = instantiate(
                 preprocess, dataset=val_dataset, encoder=encoder
             )
-        if 0 < cfg.limited_label < 1:
-
-            # Stratify train dataset
-            indices, _ = stratify_single_dataset_indices(train_dataset, num_classes=cfg.dataset.num_classes, label_fraction=cfg.limited_label, num_bins=3, logger=logger)
-            train_dataset = GeoFMSubset(train_dataset, indices)
-
-            # Stratify validation dataset
-            indices, _ = stratify_single_dataset_indices(val_dataset, num_classes=cfg.dataset.num_classes, label_fraction=cfg.limited_label, num_bins=3, logger=logger)
-            val_dataset = GeoFMSubset(val_dataset, indices)
+        if 0 < cfg.limited_label_train < 1:
+            indices = get_subset_indices(
+                train_dataset, strategy=cfg.limited_label_strategy,
+                label_fraction=cfg.limited_label_train, num_bins=cfg.stratification_bins, logger=logger
+            )
+            train_dataset = GeoFMSubset(train_dataset, indices)
 
-            logger.info(
-                f"Created a subset of the train and val dataset, with {cfg.limited_label * 100}% of the labels available\n"
+        if 0 < cfg.limited_label_val < 1:
+            indices = get_subset_indices(
+                val_dataset, strategy=cfg.limited_label_strategy,
+                label_fraction=cfg.limited_label_val, num_bins=cfg.stratification_bins, logger=logger
+            )
+            val_dataset = GeoFMSubset(val_dataset, indices)
+
+        logger.info(
                 f"Total number of train patches: {len(train_dataset)}\n"
                 f"Total number of validation patches: {len(val_dataset)}\n"
-            )
-        else:
-            logger.info("The entire train dataset will be used.")
+        )
 
     # get train val data loaders
     train_loader = DataLoader(
diff --git a/pangaea/utils/stratification.py b/pangaea/utils/subset_sampler.py
similarity index 68%
rename from pangaea/utils/stratification.py
rename to pangaea/utils/subset_sampler.py
index b88146d9..ab6c2e59 100644
--- a/pangaea/utils/stratification.py
+++ b/pangaea/utils/subset_sampler.py
@@ -1,55 +1,66 @@
+import random
 from tqdm import tqdm
 import numpy as np
-from torch.utils.data import Subset, DataLoader
+
 
 # Function to calculate class distributions with a progress bar
 def calculate_class_distributions(dataset, num_classes):
     class_distributions = []
-    
+
     # Adding a progress bar for dataset processing
-    for idx in tqdm(range(len(dataset)), desc="Calculating Class Distributions"):
+    for idx in tqdm(range(len(dataset)), desc="Calculating class distributions per sample"):
         target = dataset[idx]['target']
         total_pixels = target.numel()
         class_counts = [(target == i).sum().item() for i in range(num_classes)]
         class_ratios = [count / total_pixels for count in class_counts]
         class_distributions.append(class_ratios)
-    
+
     return np.array(class_distributions)
 
 # Function to bin class distributions with a progress bar
 def bin_class_distributions(class_distributions, num_bins=3, logger=None):
-
-    logger.info("Binning class distributions...")
+    logger.info(f"Class distributions are being binned into {num_bins} categories")
     # Adding a progress bar for binning class distributions
     binned_distributions = np.digitize(class_distributions, np.linspace(0, 1, num_bins+1)) - 1
     return binned_distributions
 
 # Function to perform stratification and return only the indices
 def stratify_single_dataset_indices(dataset, num_classes, label_fraction=1.0, num_bins=3, logger=None):
-
-    logger.info("Starting stratification...")
     # Step 1: Calculate class distributions with progress tracking
     class_distributions = calculate_class_distributions(dataset, num_classes)
 
     # Step 2: Bin the class distributions
     binned_distributions = bin_class_distributions(class_distributions, num_bins=num_bins, logger=logger)
-    
+
     # Step 3: Combine the bins to use for stratification
     combined_bins = np.apply_along_axis(lambda row: ''.join(map(str, row)), axis=1, arr=binned_distributions)
 
     # Step 4: Select a subset of labeled data with progress tracking
     num_labeled = int(len(dataset) * label_fraction)
-    logger.info(f"Selecting {label_fraction * 100:.0f}% labeled data from {len(dataset)} samples...")
-
-    # Shuffle and take the labeled part of the dataset based on the binned distributions
-    indices = np.arange(len(dataset))
-    np.random.shuffle(indices)
 
     # Sort the indices based on combined bins to preserve class distribution
     sorted_indices = np.argsort(combined_bins)
     labeled_idx = sorted_indices[:num_labeled]
     unlabeled_idx = sorted_indices[num_labeled:]
 
-    logger.info("Stratification complete.")
     return labeled_idx, unlabeled_idx
+
+
+# Function to get subset indices based on the strategy
+def get_subset_indices(dataset, strategy="random", label_fraction=0.5, num_bins=3, logger=None):
+    logger.info(
+        f"Creat a subset of the {dataset.split} dataset using {strategy} strategy, with {label_fraction * 100}% of labels utilized."
+    )
+    if strategy == "stratified":
+        indices, _ = stratify_single_dataset_indices(
+            dataset, num_classes=dataset.num_classes, label_fraction=label_fraction, num_bins=num_bins, logger=logger
+        )
+    else:
+        n_samples = len(dataset)
+        indices = random.sample(
            range(n_samples), int(n_samples * label_fraction)
+        )
+
+    return indices
\ No newline at end of file
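
`bin_class_distributions` hinges on `np.digitize` over `num_bins + 1` equally spaced edges in [0, 1]. A small demo of the mapping it produces (values chosen to land inside the bins, since a ratio of exactly 1.0 falls past the last edge):

```python
import numpy as np

ratios = np.array([[0.05, 0.95],
                   [0.40, 0.60],
                   [0.70, 0.30]])  # per-image class fractions
num_bins = 3
bins = np.digitize(ratios, np.linspace(0, 1, num_bins + 1)) - 1
print(bins)
# [[0 2]
#  [1 1]
#  [2 0]]
```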

From f25a4455796f6ba66e368432b9de2f6483834c34 Mon Sep 17 00:00:00 2001
From: Ali Shibli
Date: Tue, 8 Oct 2024 14:16:06 +0200
Subject: [PATCH 08/17] add regression stratification

---
 configs/train.yaml              |  2 +-
 pangaea/utils/subset_sampler.py | 69 +++++++++++++++++++++++++++------
 2 files changed, 59 insertions(+), 12 deletions(-)

diff --git a/configs/train.yaml b/configs/train.yaml
index 1c9e7ade..5f47d1ed 100644
--- a/configs/train.yaml
+++ b/configs/train.yaml
@@ -14,7 +14,7 @@ finetune: false
 ckpt_dir: null
 limited_label_train: 1
 limited_label_val: 1
-limited_label_strategy: stratified # stratified, random
+limited_label_strategy: stratified_classification # stratified_regression, random
 stratification_bins: 3 # number of bins for stratified sampling, only for stratified
diff --git a/pangaea/utils/subset_sampler.py b/pangaea/utils/subset_sampler.py
index ab6c2e59..9d6290e1 100644
--- a/pangaea/utils/subset_sampler.py
+++ b/pangaea/utils/subset_sampler.py
@@ -3,7 +3,7 @@
 import numpy as np
 
 
-# Function to calculate class distributions with a progress bar
+# Function to calculate class distributions for classification with a progress bar
 def calculate_class_distributions(dataset, num_classes):
     class_distributions = []
 
@@ -17,17 +17,41 @@ def calculate_class_distributions(dataset, num_classes):
 
     return np.array(class_distributions)
 
+
+# Function to calculate distribution metrics for regression
+def calculate_regression_distributions(dataset):
+    distributions = []
+
+    # Adding a progress bar for dataset processing
+    for idx in tqdm(range(len(dataset)), desc="Calculating regression distributions per sample"):
+        target = dataset[idx]['target']
+        mean_value = target.mean().item()  # Example for mean; adjust as needed for other metrics
+        distributions.append(mean_value)
+
+    return np.array(distributions)
+
+
 # Function to bin class distributions with a progress bar
 def bin_class_distributions(class_distributions, num_bins=3, logger=None):
     logger.info(f"Class distributions are being binned into {num_bins} categories")
     # Adding a progress bar for binning class distributions
     binned_distributions = np.digitize(class_distributions, np.linspace(0, 1, num_bins+1)) - 1
     return binned_distributions
 
+
+# Function to bin regression distributions with a progress bar
+def bin_regression_distributions(regression_distributions, num_bins=3, logger=None):
+    logger.info(f"Regression distributions are being binned into {num_bins} categories")
+    # Define the range for binning based on minimum and maximum values in regression distributions
+    binned_distributions = np.digitize(
+        regression_distributions,
+        np.linspace(regression_distributions.min(), regression_distributions.max(), num_bins + 1)
+    ) - 1
+    return binned_distributions
+
+
-# Function to perform stratification and return only the indices
-def stratify_single_dataset_indices(dataset, num_classes, label_fraction=1.0, num_bins=3, logger=None):
+# Function to perform stratification for classification and return only the indices
+def stratify_classification_dataset_indices(dataset, num_classes, label_fraction=1.0, num_bins=3, logger=None):
     # Step 1: Calculate class distributions with progress tracking
     class_distributions = calculate_class_distributions(dataset, num_classes)
 
@@ -48,19 +72,42 @@ def stratify_classification_dataset_indices(dataset, num_classes, label_fraction
     labeled_idx = sorted_indices[:num_labeled]
     unlabeled_idx = sorted_indices[num_labeled:]
 
     return labeled_idx, unlabeled_idx
 
 
+# Function to perform stratification for regression and return only the indices
+def stratify_regression_dataset_indices(dataset, label_fraction=1.0, num_bins=3, logger=None):
+    # Step 1: Calculate regression distributions with progress tracking
+    regression_distributions = calculate_regression_distributions(dataset)
+
+    # Step 2: Bin the regression distributions
+    binned_distributions = bin_regression_distributions(regression_distributions, num_bins=num_bins, logger=logger)
+
+    # Step 3: Sort the indices based on binned distributions for stratification
+    sorted_indices = np.argsort(binned_distributions)
+
+    # Step 4: Select a subset of labeled data with progress tracking
+    num_labeled = int(len(dataset) * label_fraction)
+    labeled_idx = sorted_indices[:num_labeled]
+    unlabeled_idx = sorted_indices[num_labeled:]
+
+    return labeled_idx, unlabeled_idx
+
+
-# Function to get subset indices based on the strategy
+# Function to get subset indices based on the strategy, supporting both classification and regression
 def get_subset_indices(dataset, strategy="random", label_fraction=0.5, num_bins=3, logger=None):
     logger.info(
-        f"Creat a subset of the {dataset.split} dataset using {strategy} strategy, with {label_fraction * 100}% of labels utilized."
-    )
-    if strategy == "stratified":
-        indices, _ = stratify_single_dataset_indices(
+        f"Creating a subset of the {dataset.split} dataset using {strategy} strategy, with {label_fraction * 100}% of labels utilized."
+    )
+    if strategy == "stratified_classification":
+        indices, _ = stratify_classification_dataset_indices(
             dataset, num_classes=dataset.num_classes, label_fraction=label_fraction, num_bins=num_bins, logger=logger
         )
-    else:
+    elif strategy == "stratified_regression":
+        indices, _ = stratify_regression_dataset_indices(
+            dataset, label_fraction=label_fraction, num_bins=num_bins, logger=logger
+        )
+    else:  # Default to random sampling
         n_samples = len(dataset)
         indices = random.sample(
             range(n_samples), int(n_samples * label_fraction)
         )
 
-    return indices
\ No newline at end of file
+    return indices

From 33053c2d4827b7f4df8ee41e2a8f9734ae8114ef Mon Sep 17 00:00:00 2001
From: Ritu Yadav <40523539+RituYadav92@users.noreply.github.com>
Date: Wed, 9 Oct 2024 12:33:37 +0200
Subject: [PATCH 09/17] Updated "stratify_regression_dataset_indices" function
 to return fraction of labels from each bin

Previous code: A fraction of labels were selected from the sorted values.
Specifically, for biomass, it was selecting samples with the lowest biomass.
---
 pangaea/utils/subset_sampler.py | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/pangaea/utils/subset_sampler.py b/pangaea/utils/subset_sampler.py
index 9d6290e1..86b6a008 100644
--- a/pangaea/utils/subset_sampler.py
+++ b/pangaea/utils/subset_sampler.py
@@ -25,7 +25,7 @@ def calculate_regression_distributions(dataset):
     # Adding a progress bar for dataset processing
     for idx in tqdm(range(len(dataset)), desc="Calculating regression distributions per sample"):
         target = dataset[idx]['target']
-        mean_value = target.mean().item()  # Example for mean; adjust as needed for other metrics
+        mean_value = target.mean().item()  # Example for patch-wise mean; adjust as needed for other metrics
         distributions.append(mean_value)
 
     return np.array(distributions)
@@ -79,16 +79,33 @@ def stratify_regression_dataset_indices(dataset, label_fraction=1.0, num_bins=3,
 
     # Step 2: Bin the regression distributions
     binned_distributions = bin_regression_distributions(regression_distributions, num_bins=num_bins, logger=logger)
+
+    # Step 3: Prep a dictionary to hold indices for each bin
+    indices_per_bin = {i: [] for i in range(num_bins)}
+
+    # Step 4: Populate the indices per bin
+    for index, bin_index in enumerate(binned_distributions):
+        if bin_index in indices_per_bin:
+            indices_per_bin[bin_index].append(index)
 
-    # Step 3: Sort the indices based on binned distributions for stratification
-    sorted_indices = np.argsort(binned_distributions)
+    # Step 5: Select fraction of indices from each bin
+    selected_idx = []
+    for bin_index, indices in indices_per_bin.items():
+        num_to_select = int(max(1, len(indices)*label_fraction))  # To ensure at least one index is selected
+        selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
+    other_idx = list(set(range(len(dataset))) - set(selected_idx))
 
-    # Step 4: Select a subset of labeled data with progress tracking
-    num_labeled = int(len(dataset) * label_fraction)
-    labeled_idx = sorted_indices[:num_labeled]
-    unlabeled_idx = sorted_indices[num_labeled:]
+    return selected_idx, other_idx
+
+    # # Step 3: Sort the indices based on binned distributions for stratification
+    # sorted_indices = np.argsort(binned_distributions)
+
+    # # Step 4: Select a subset of labeled data with progress tracking
+    # num_labeled = int(len(dataset) * label_fraction)
+    # labeled_idx = sorted_indices[:num_labeled]
+    # unlabeled_idx = sorted_indices[num_labeled:]
 
-    return labeled_idx, unlabeled_idx
+    # return labeled_idx, unlabeled_idx
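
The key change in this commit: instead of slicing a globally sorted index list (which, for biomass, kept only the lowest-value samples), the fraction is now drawn inside every bin. The selection loop in isolation, on a hand-made bin assignment:

```python
import numpy as np

np.random.seed(1)
binned = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2])  # bin index per sample
label_fraction = 0.5

selected = []
for b in np.unique(binned):
    members = np.flatnonzero(binned == b)
    k = int(max(1, len(members) * label_fraction))  # at least one per bin
    selected.extend(np.random.choice(members, k, replace=False))
print(sorted(selected))  # one sample from each 3-element bin, two from the 4-element bin
```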

From 927eec65d6706f013c8e8d0df0d69fe1fc383c0e Mon Sep 17 00:00:00 2001
From: Ali Shibli
Date: Thu, 10 Oct 2024 11:05:28 +0200
Subject: [PATCH 10/17] adding segmentation stratification

---
 pangaea/utils/subset_sampler.py | 52 +++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 25 deletions(-)

diff --git a/pangaea/utils/subset_sampler.py b/pangaea/utils/subset_sampler.py
index 86b6a008..fca604b0 100644
--- a/pangaea/utils/subset_sampler.py
+++ b/pangaea/utils/subset_sampler.py
@@ -2,7 +2,6 @@
 from tqdm import tqdm
 import numpy as np
 
-
 # Function to calculate class distributions for classification with a progress bar
 def calculate_class_distributions(dataset, num_classes):
     class_distributions = []
@@ -50,26 +49,36 @@ def bin_regression_distributions(regression_distributions, num_bins=3, logger=No
     return binned_distributions
 
 
-# Function to perform stratification for classification and return only the indices
+# Updated function to perform stratification for classification and return only the indices, with even bin selection
 def stratify_classification_dataset_indices(dataset, num_classes, label_fraction=1.0, num_bins=3, logger=None):
     # Step 1: Calculate class distributions with progress tracking
     class_distributions = calculate_class_distributions(dataset, num_classes)
 
     # Step 2: Bin the class distributions
     binned_distributions = bin_class_distributions(class_distributions, num_bins=num_bins, logger=logger)
-
-    # Step 3: Combine the bins to use for stratification
+
+    # Step 3: Prep a dictionary to hold indices for each bin combination
+    indices_per_bin = {}
+
+    # Combine the bins for each class to create unique bin identifiers
     combined_bins = np.apply_along_axis(lambda row: ''.join(map(str, row)), axis=1, arr=binned_distributions)
 
-    # Step 4: Select a subset of labeled data with progress tracking
-    num_labeled = int(len(dataset) * label_fraction)
+    # Populate the dictionary with indices based on combined bin identifiers
+    for idx, bin_id in enumerate(combined_bins):
+        if bin_id not in indices_per_bin:
+            indices_per_bin[bin_id] = []
+        indices_per_bin[bin_id].append(idx)
 
-    # Sort the indices based on combined bins to preserve class distribution
-    sorted_indices = np.argsort(combined_bins)
-    labeled_idx = sorted_indices[:num_labeled]
-    unlabeled_idx = sorted_indices[num_labeled:]
+    # Step 4: Select a proportion of indices from each bin
+    selected_idx = []
+    for bin_id, indices in indices_per_bin.items():
+        num_to_select = int(max(1, len(indices) * label_fraction))  # Ensure at least one index is selected
+        selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
 
-    return labeled_idx, unlabeled_idx
+    # Step 5: Determine the remaining indices not selected
+    other_idx = list(set(range(len(dataset))) - set(selected_idx))
+
+    return selected_idx, other_idx
 
 
 # Function to perform stratification for regression and return only the indices
@@ -87,25 +96,16 @@ def stratify_regression_dataset_indices(dataset, label_fraction=1.0, num_bins=3,
     for index, bin_index in enumerate(binned_distributions):
         if bin_index in indices_per_bin:
             indices_per_bin[bin_index].append(index)
-
+
     # Step 5: Select fraction of indices from each bin
     selected_idx = []
     for bin_index, indices in indices_per_bin.items():
-        num_to_select = int(max(1, len(indices)*label_fraction))  # To ensure at least one index is selected
-        selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
+        num_to_select = int(max(1, len(indices) * label_fraction))  # Ensure at least one index is selected
+        selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
+
     other_idx = list(set(range(len(dataset))) - set(selected_idx))
-
-    return selected_idx, other_idx
-
-    # # Step 3: Sort the indices based on binned distributions for stratification
-    # sorted_indices = np.argsort(binned_distributions)
 
-    # # Step 4: Select a subset of labeled data with progress tracking
-    # num_labeled = int(len(dataset) * label_fraction)
-    # labeled_idx = sorted_indices[:num_labeled]
-    # unlabeled_idx = sorted_indices[num_labeled:]
 
-    # return labeled_idx, unlabeled_idx
+    return selected_idx, other_idx
 
 
 # Function to get subset indices based on the strategy, supporting both classification and regression
@@ -128,3 +128,5 @@ def get_subset_indices(dataset, strategy="random", label_fraction=0.5, num_bins=
     )
 
     return indices
+
+

From 04f783b4c6e8c6921c5f18151ff1b5d5f9375d0a Mon Sep 17 00:00:00 2001
From: yurujaja
Date: Thu, 10 Oct 2024 13:23:04 +0200
Subject: [PATCH 11/17] deep copy ckpt

---
 pangaea/engine/trainer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pangaea/engine/trainer.py b/pangaea/engine/trainer.py
index da55a685..2a3f37c7 100644
--- a/pangaea/engine/trainer.py
+++ b/pangaea/engine/trainer.py
@@ -1,3 +1,4 @@
+import copy
 import logging
 import operator
 import os
@@ -193,7 +194,7 @@ def get_checkpoint(self, epoch: int) -> dict[str, dict | int]:
             "scaler": self.scaler.state_dict(),
             "epoch": epoch,
         }
-        return checkpoint
+        return copy.deepcopy(checkpoint)
 
     def save_model(
         self,
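
The deep copy matters because `state_dict()` returns tensors that alias the live parameters, so an in-memory "best checkpoint" would otherwise keep changing as training continues. A minimal demonstration:

```python
import copy
import torch

layer = torch.nn.Linear(2, 2)
shallow = layer.state_dict()                # aliases the live weights
frozen = copy.deepcopy(layer.state_dict())  # independent snapshot

with torch.no_grad():
    layer.weight.add_(1.0)                  # simulate a later optimizer step

print(torch.equal(shallow["weight"], layer.weight))  # True  -> it drifted
print(torch.equal(frozen["weight"], layer.weight))   # False -> snapshot intact
```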

From b3b13784f64ad6deeeb62918942278de1b3b2627 Mon Sep 17 00:00:00 2001
From: Ritu Yadav <40523539+RituYadav92@users.noreply.github.com>
Date: Thu, 10 Oct 2024 14:45:23 +0200
Subject: [PATCH 12/17] synched the steps between classification and regression

---
 pangaea/utils/subset_sampler.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/pangaea/utils/subset_sampler.py b/pangaea/utils/subset_sampler.py
index fca604b0..7256dc9d 100644
--- a/pangaea/utils/subset_sampler.py
+++ b/pangaea/utils/subset_sampler.py
@@ -63,19 +63,19 @@ def stratify_classification_dataset_indices(dataset, num_classes, label_fraction
     # Combine the bins for each class to create unique bin identifiers
     combined_bins = np.apply_along_axis(lambda row: ''.join(map(str, row)), axis=1, arr=binned_distributions)
 
-    # Populate the dictionary with indices based on combined bin identifiers
+    # Step 4: Populate the dictionary with indices based on combined bin identifiers
     for idx, bin_id in enumerate(combined_bins):
         if bin_id not in indices_per_bin:
             indices_per_bin[bin_id] = []
         indices_per_bin[bin_id].append(idx)
 
-    # Step 4: Select a proportion of indices from each bin
+    # Step 5: Select a fraction of indices from each bin
     selected_idx = []
     for bin_id, indices in indices_per_bin.items():
         num_to_select = int(max(1, len(indices) * label_fraction))  # Ensure at least one index is selected
         selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
 
-    # Step 5: Determine the remaining indices not selected
+    # Step 6: List the remaining unselected indices
     other_idx = list(set(range(len(dataset))) - set(selected_idx))
 
     return selected_idx, other_idx
@@ -97,12 +97,13 @@ def stratify_regression_dataset_indices(dataset, label_fraction=1.0, num_bins=3,
         if bin_index in indices_per_bin:
             indices_per_bin[bin_index].append(index)
 
-    # Step 5: Select fraction of indices from each bin
+    # Step 5: Select a fraction of indices from each bin
     selected_idx = []
     for bin_index, indices in indices_per_bin.items():
         num_to_select = int(max(1, len(indices) * label_fraction))  # Ensure at least one index is selected
         selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
-
+
+    # Step 6: List the remaining unselected indices
     other_idx = list(set(range(len(dataset))) - set(selected_idx))
 
     return selected_idx, other_idx

From bdc7dd6fba663f70bbd10ea3f08735347cd8669b Mon Sep 17 00:00:00 2001
From: yurujaja
Date: Thu, 10 Oct 2024 16:03:13 +0200
Subject: [PATCH 13/17] enable stratified sampling and oversampling

---
 configs/train.yaml              |   3 +-
 pangaea/engine/trainer.py       |   5 +-
 pangaea/run.py                  |  23 ++++--
 pangaea/utils/subset_sampler.py | 137 ++++++++++++++++++++------------
 4 files changed, 104 insertions(+), 64 deletions(-)

diff --git a/configs/train.yaml b/configs/train.yaml
index 5f47d1ed..239db1f8 100644
--- a/configs/train.yaml
+++ b/configs/train.yaml
@@ -14,10 +14,11 @@ finetune: false
 ckpt_dir: null
 limited_label_train: 1
 limited_label_val: 1
-limited_label_strategy: stratified_classification # stratified_regression, random
+limited_label_strategy: stratified # Options: stratified, oversampled, random
 stratification_bins: 3 # number of bins for stratified sampling, only for stratified
 
+
 defaults:
   - task: ???
   - dataset: ???
diff --git a/pangaea/engine/trainer.py b/pangaea/engine/trainer.py
index e4e0f1d8..d10cec40 100644
--- a/pangaea/engine/trainer.py
+++ b/pangaea/engine/trainer.py
@@ -78,10 +78,7 @@ def __init__(
         self.training_metrics = {}
         self.best_ckpt = None
         self.best_metric_comp = operator.gt
-        if isinstance(self.train_loader.dataset, Subset):
-            self.num_classes = self.train_loader.dataset.dataset.num_classes
-        else:
-            self.num_classes = self.train_loader.dataset.num_classes
+        self.num_classes = self.train_loader.dataset.num_classes
 
         assert precision in [
             "fp32",
diff --git a/pangaea/run.py b/pangaea/run.py
index 213b6d49..a5e3687d 100644
--- a/pangaea/run.py
+++ b/pangaea/run.py
@@ -29,21 +29,30 @@
 from pangaea.datasets.base import GeoFMSubset
 
 
-def get_exp_name(hydra_config: HydraConf) -> str:
+def get_exp_info(hydra_config: HydraConf) -> str:
     """Create a unique experiment name based on the choices made in the config.
 
     Args:
         hydra_config (HydraConf): hydra config.
 
     Returns:
-        str: experiment name.
+        str: experiment information.
     """
     choices = OmegaConf.to_container(hydra_config.runtime.choices)
     timestamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
     fm = choices["encoder"]
     decoder = choices["decoder"]
     ds = choices["dataset"]
-    return f"{timestamp}-{fm}-{decoder}-{ds}"
+    task = choices["task"]
+    exp_info = {
+        "timestamp": timestamp,
+        "fm": fm,
+        "decoder": decoder,
+        "ds": ds,
+        "task": task,
+        "exp_name": f"{timestamp}_{fm}_{decoder}_{ds}",
+    }
+    return exp_info
 
 
 @hydra.main(version_base=None, config_path="../configs", config_name="train")
@@ -66,7 +75,9 @@ def main(cfg: DictConfig) -> None:
     # true if training else false
     train_run = cfg.train
     if train_run:
-        exp_name = get_exp_name(HydraConfig.get())
+        exp_info = get_exp_info(HydraConfig.get())
+        exp_name = exp_info["exp_name"]
+        task_name = exp_info["task"]
         exp_dir = pathlib.Path(cfg.work_dir) / exp_name
         exp_dir.mkdir(parents=True, exist_ok=True)
         logger_path = exp_dir / "train.log"
@@ -154,14 +163,14 @@ def main(cfg: DictConfig) -> None:
         if 0 < cfg.limited_label_train < 1:
             indices = get_subset_indices(
-                train_dataset, strategy=cfg.limited_label_strategy,
+                train_dataset, task=task_name, strategy=cfg.limited_label_strategy,
                 label_fraction=cfg.limited_label_train, num_bins=cfg.stratification_bins, logger=logger
             )
             train_dataset = GeoFMSubset(train_dataset, indices)
 
         if 0 < cfg.limited_label_val < 1:
             indices = get_subset_indices(
-                val_dataset, strategy=cfg.limited_label_strategy,
+                val_dataset, task=task_name, strategy=cfg.limited_label_strategy,
                 label_fraction=cfg.limited_label_val, num_bins=cfg.stratification_bins, logger=logger
             )
             val_dataset = GeoFMSubset(val_dataset, indices)
diff --git a/pangaea/utils/subset_sampler.py b/pangaea/utils/subset_sampler.py
index 7256dc9d..b6cba9b7 100644
--- a/pangaea/utils/subset_sampler.py
+++ b/pangaea/utils/subset_sampler.py
@@ -1,27 +1,39 @@
 import random
 from tqdm import tqdm
 import numpy as np
+from pangaea.datasets.base import GeoFMDataset
+from pangaea.datasets.base import GeoFMSubset
 
-# Function to calculate class distributions for classification with a progress bar
-def calculate_class_distributions(dataset, num_classes):
+# Calculate image-wise class distributions for segmentation
+def calculate_class_distributions(dataset: GeoFMDataset|GeoFMSubset):
+    num_classes = dataset.num_classes
+    ignore_index = dataset.ignore_index
     class_distributions = []
 
-    # Adding a progress bar for dataset processing
     for idx in tqdm(range(len(dataset)), desc="Calculating class distributions per sample"):
         target = dataset[idx]['target']
+
+        if ignore_index is not None:
+            target=target[(target != ignore_index)]
+
         total_pixels = target.numel()
-        class_counts = [(target == i).sum().item() for i in range(num_classes)]
-        class_ratios = [count / total_pixels for count in class_counts]
-        class_distributions.append(class_ratios)
+        if total_pixels == 0:
+            class_distributions.append([0] * num_classes)
+            continue
+        else:
+            class_counts = [(target == i).sum().item() for i in range(num_classes)]
+            class_ratios = [count / total_pixels for count in class_counts]
+            class_distributions.append(class_ratios)
+
+    print(np.mean(class_distributions, axis=0))
 
     return np.array(class_distributions)
 
 
-# Function to calculate distribution metrics for regression
-def calculate_regression_distributions(dataset):
+# Calculate image-wise distributions for regression
+def calculate_regression_distributions(dataset: GeoFMDataset|GeoFMSubset):
     distributions = []
 
-    # Adding a progress bar for dataset processing
     for idx in tqdm(range(len(dataset)), desc="Calculating regression distributions per sample"):
         target = dataset[idx]['target']
         mean_value = target.mean().item()  # Example for patch-wise mean; adjust as needed for other metrics
         distributions.append(mean_value)
 
     return np.array(distributions)
 
 
-# Function to bin class distributions with a progress bar
+# Function to bin class distributions
 def bin_class_distributions(class_distributions, num_bins=3, logger=None):
     logger.info(f"Class distributions are being binned into {num_bins} categories")
-    # Adding a progress bar for binning class distributions
+
     binned_distributions = np.digitize(class_distributions, np.linspace(0, 1, num_bins+1)) - 1
     return binned_distributions
 
 
-# Function to bin regression distributions with a progress bar
+# Function to bin regression distributions
 def bin_regression_distributions(regression_distributions, num_bins=3, logger=None):
     logger.info(f"Regression distributions are being binned into {num_bins} categories")
     # Define the range for binning based on minimum and maximum values in regression distributions
     binned_distributions = np.digitize(
         regression_distributions,
         np.linspace(regression_distributions.min(), regression_distributions.max(), num_bins + 1)
     ) - 1
     return binned_distributions
 
 
-# Updated function to perform stratification for classification and return only the indices, with even bin selection
-def stratify_classification_dataset_indices(dataset, num_classes, label_fraction=1.0, num_bins=3, logger=None):
-    # Step 1: Calculate class distributions with progress tracking
-    class_distributions = calculate_class_distributions(dataset, num_classes)
+def balance_seg_indices(
+        dataset:GeoFMDataset|GeoFMSubset,
+        strategy,
+        label_fraction=1.0,
+        num_bins=3,
+        logger=None):
+    # Calculate class distributions with progress tracking
+    class_distributions = calculate_class_distributions(dataset)
 
-    # Step 2: Bin the class distributions
+    # Bin the class distributions
     binned_distributions = bin_class_distributions(class_distributions, num_bins=num_bins, logger=logger)
+    combined_bins = np.apply_along_axis(lambda row: ''.join(map(str, row)), axis=1, arr=binned_distributions)
 
-    # Step 3: Prep a dictionary to hold indices for each bin combination
     indices_per_bin = {}
-
-    # Combine the bins for each class to create unique bin identifiers
-    combined_bins = np.apply_along_axis(lambda row: ''.join(map(str, row)), axis=1, arr=binned_distributions)
-
-    # Step 4: Populate the dictionary with indices based on combined bin identifiers
     for idx, bin_id in enumerate(combined_bins):
         if bin_id not in indices_per_bin:
             indices_per_bin[bin_id] = []
         indices_per_bin[bin_id].append(idx)
 
-    # Step 5: Select a fraction of indices from each bin
-    selected_idx = []
-    for bin_id, indices in indices_per_bin.items():
-        num_to_select = int(max(1, len(indices) * label_fraction))  # Ensure at least one index is selected
-        selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
+    if strategy == "stratified":
+        # Select a proportion of indices from each bin
+        selected_idx = []
+        for bin_id, indices in indices_per_bin.items():
+            num_to_select = int(max(1, len(indices) * label_fraction))  # Ensure at least one index is selected
+            selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
+    elif strategy == "oversampled":
+        # Prioritize the bins with the lowest values
+        sorted_indices = np.argsort(combined_bins)
+        selected_idx = sorted_indices[:int(len(dataset) * label_fraction)]
 
-    # Step 6: List the remaining unselected indices
+    # Determine the remaining indices not selected
     other_idx = list(set(range(len(dataset))) - set(selected_idx))
 
     return selected_idx, other_idx
 
 
-# Function to perform stratification for regression and return only the indices
-def stratify_regression_dataset_indices(dataset, label_fraction=1.0, num_bins=3, logger=None):
-    # Step 1: Calculate regression distributions with progress tracking
+def balance_reg_indices(
+        dataset:GeoFMDataset|GeoFMSubset,
+        strategy,
+        label_fraction=1.0,
+        num_bins=3,
+        logger=None):
+
     regression_distributions = calculate_regression_distributions(dataset)
-
-    # Step 2: Bin the regression distributions
     binned_distributions = bin_regression_distributions(regression_distributions, num_bins=num_bins, logger=logger)
 
     indices_per_bin = {i: [] for i in range(num_bins)}
 
-    # Step 4: Populate the indices per bin
+    # Populate the indices per bin
     for index, bin_index in enumerate(binned_distributions):
         if bin_index in indices_per_bin:
             indices_per_bin[bin_index].append(index)
 
-    # Step 5: Select a fraction of indices from each bin
-    selected_idx = []
-    for bin_index, indices in indices_per_bin.items():
-        num_to_select = int(max(1, len(indices) * label_fraction))  # Ensure at least one index is selected
-        selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
+    if strategy == "stratified":
+        # Select fraction of indices from each bin
+        selected_idx = []
+        for bin_index, indices in indices_per_bin.items():
+            num_to_select = int(max(1, len(indices) * label_fraction))  # Ensure at least one index is selected
+            selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
+    elif strategy == "oversampled":
+        # Prioritize the bins with the lowest values
+        sorted_indices = np.argsort(binned_distributions)
+        selected_idx = sorted_indices[:int(len(dataset) * label_fraction)]
 
-    # Step 6: List the remaining unselected indices
     other_idx = list(set(range(len(dataset))) - set(selected_idx))
 
     return selected_idx, other_idx
 
 
 # Function to get subset indices based on the strategy, supporting both classification and regression
-def get_subset_indices(dataset, strategy="random", label_fraction=0.5, num_bins=3, logger=None):
+def get_subset_indices(dataset: GeoFMDataset,
+                       task="segmentation",
+                       strategy="random",
+                       label_fraction=0.5,
+                       num_bins=3,
+                       logger=None):
     logger.info(
         f"Creating a subset of the {dataset.split} dataset using {strategy} strategy, with {label_fraction * 100}% of labels utilized."
     )
-    if strategy == "stratified_classification":
-        indices, _ = stratify_classification_dataset_indices(
-            dataset, num_classes=dataset.num_classes, label_fraction=label_fraction, num_bins=num_bins, logger=logger
-        )
-    elif strategy == "stratified_regression":
-        indices, _ = stratify_regression_dataset_indices(
-            dataset, label_fraction=label_fraction, num_bins=num_bins, logger=logger
-        )
-    else:  # Default to random sampling
+    assert strategy in ["random", "stratified", "oversampled"], "Unsupported dataset subsampling strategy"
+
+    if strategy == "random":
         n_samples = len(dataset)
         indices = random.sample(
             range(n_samples), int(n_samples * label_fraction)
         )
+        return indices
+
+    elif task == "segmentation":
+        indices, _ = balance_seg_indices(
+            dataset, strategy=strategy, label_fraction=label_fraction, num_bins=num_bins, logger=logger
+        )
+    elif task == "regression":
+        indices, _ = balance_reg_indices(
+            dataset, strategy=strategy, label_fraction=label_fraction, num_bins=num_bins, logger=logger
+        )
 
     return indices
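
Driving the new dispatcher end to end, assuming the package from this series is importable; the stub dataset below is a made-up stand-in exposing only the fields the sampler reads (`split`, `num_classes`, `ignore_index`, `__len__`, `__getitem__`):

```python
import logging
import torch
from pangaea.utils.subset_sampler import get_subset_indices

logging.basicConfig(level=logging.INFO)

class StubDataset:
    """Minimal stand-in for a GeoFMDataset."""
    split, num_classes, ignore_index = "train", 2, None

    def __len__(self):
        return 20

    def __getitem__(self, idx):
        g = torch.Generator().manual_seed(idx)
        return {"target": torch.randint(0, 2, (8, 8), generator=g)}

indices = get_subset_indices(
    StubDataset(), task="segmentation", strategy="stratified",
    label_fraction=0.5, num_bins=3, logger=logging.getLogger("sampler"),
)
print(f"kept {len(indices)} of 20 patches")
```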

From 28238dbd08c45b9fe1d9824ea6a90614f508c8b9 Mon Sep 17 00:00:00 2001
From: yurujaja
Date: Thu, 10 Oct 2024 16:39:10 +0200
Subject: [PATCH 14/17] add docstring

---
 pangaea/utils/subset_sampler.py | 65 ++++++++++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 2 deletions(-)

diff --git a/pangaea/utils/subset_sampler.py b/pangaea/utils/subset_sampler.py
index b6cba9b7..06b5e8d6 100644
--- a/pangaea/utils/subset_sampler.py
+++ b/pangaea/utils/subset_sampler.py
@@ -24,8 +24,6 @@ def calculate_class_distributions(dataset: GeoFMDataset|GeoFMSubset):
             class_counts = [(target == i).sum().item() for i in range(num_classes)]
             class_ratios = [count / total_pixels for count in class_counts]
             class_distributions.append(class_ratios)
-
-    print(np.mean(class_distributions, axis=0))
 
     return np.array(class_distributions)
@@ -65,6 +63,37 @@ def balance_seg_indices(
         strategy,
         label_fraction=1.0,
         num_bins=3,
         logger=None):
+    """
+    Balances and selects indices from a segmentation dataset based on the specified strategy.
+
+    Args:
+    dataset : GeoFMDataset | GeoFMSubset
+        The dataset from which to select indices, typically containing geospatial segmentation data.
+
+    strategy : str
+        The strategy to use for selecting indices. Options include:
+        - "stratified": Proportionally selects indices from each class bin based on the class distribution.
+        - "oversampled": Prioritizes and selects indices from bins with lower class representation.
+
+    label_fraction : float, optional, default=1.0
+        The fraction of labels (indices) to select from each class or bin. Values should be between 0 and 1.
+
+    num_bins : int, optional, default=3
+        The number of bins to divide the class distributions into, used for stratification or oversampling.
+
+    logger : object, optional
+        A logger object for tracking progress or logging messages (e.g., `logging.Logger`)
+
+    ------
+
+    Returns:
+    selected_idx : list of int
+        The indices of the selected samples based on the strategy and label fraction.
+
+    other_idx : list of int
+        The remaining indices that were not selected.
+
+    """
+
     # Calculate class distributions with progress tracking
     class_distributions = calculate_class_distributions(dataset)
@@ -131,6 +160,38 @@ def balance_reg_indices(
         strategy,
         label_fraction=1.0,
         num_bins=3,
         logger=None):
+
+    """
+    Balances and selects indices from a regression dataset based on the specified strategy.
+
+    Args:
+    dataset : GeoFMDataset | GeoFMSubset
+        The dataset from which to select indices, typically containing geospatial regression data.
+
+    strategy : str
+        The strategy to use for selecting indices. Options include:
+        - "stratified": Proportionally selects indices from each bin based on the binned regression distributions.
+        - "oversampled": Prioritizes and selects indices from bins with lower representation.
+
+    label_fraction : float, optional, default=1.0
+        The fraction of indices to select from each bin. Values should be between 0 and 1.
+
+    num_bins : int, optional, default=3
+        The number of bins to divide the regression distributions into, used for stratification or oversampling.
+
+    logger : object, optional
+        A logger object for tracking progress or logging messages (e.g., `logging.Logger`). If None, no logging is performed.
+
+    ------
+
+    Returns:
+    selected_idx : list of int
+        The indices of the selected samples based on the strategy and label fraction.
+
+    other_idx : list of int
+        The remaining indices that were not selected.
+
+    """
+
     regression_distributions = calculate_regression_distributions(dataset)

From afa76a0493ac08f255275f5eb66b6219f6a467bc Mon Sep 17 00:00:00 2001
From: Valerio Marsocci <49810041+VMarsocci@users.noreply.github.com>
Date: Thu, 10 Oct 2024 16:55:30 +0200
Subject: [PATCH 15/17] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7bd90efc..78f5c73c 100644
--- a/README.md
+++ b/README.md
@@ -265,7 +265,7 @@ torchrun --nnodes=1 --nproc_per_node=1 pangaea/run.py \
     --config-name=train \
     dataset=sen1floods11 \
     encoder=unet_encoder \
-    decoder=unet \
+    decoder=seg_unet \
     preprocessing=seg_default \
     criterion=cross_entropy \
     task=segmentation \

From 9159a83462926ad21292e1b74c85e4343883 Mon Sep 17 00:00:00 2001
From: Ritu Yadav <40523539+RituYadav92@users.noreply.github.com>
Date: Thu, 10 Oct 2024 17:02:07 +0200
Subject: [PATCH 16/17] Added comment to guide oversampling for biomass or
 regression in general

---
 pangaea/utils/subset_sampler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pangaea/utils/subset_sampler.py b/pangaea/utils/subset_sampler.py
index 06b5e8d6..b73a218e 100644
--- a/pangaea/utils/subset_sampler.py
+++ b/pangaea/utils/subset_sampler.py
@@ -182,7 +182,7 @@ def balance_reg_indices(
         num_to_select = int(max(1, len(indices) * label_fraction))  # Ensure at least one index is selected
         selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
     elif strategy == "oversampled":
-        # Prioritize the bins with the lowest values
+        # Prioritize the bins with the lowest values. Can change to prioritize high values ( ex: high biomass samples)
         sorted_indices = np.argsort(binned_distributions)
         selected_idx = sorted_indices[:int(len(dataset) * label_fraction)]

From 424535c3743faf789ab7626f8c1b45fc52e16ca8 Mon Sep 17 00:00:00 2001
From: Yuru Jia <91590963+yurujaja@users.noreply.github.com>
Date: Thu, 10 Oct 2024 17:08:43 +0200
Subject: [PATCH 17/17] Update a comment

---
 pangaea/utils/subset_sampler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pangaea/utils/subset_sampler.py b/pangaea/utils/subset_sampler.py
index b73a218e..ffbf763d 100644
--- a/pangaea/utils/subset_sampler.py
+++ b/pangaea/utils/subset_sampler.py
@@ -182,7 +182,7 @@ def balance_reg_indices(
         num_to_select = int(max(1, len(indices) * label_fraction))  # Ensure at least one index is selected
         selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
     elif strategy == "oversampled":
-        # Prioritize the bins with the lowest values. Can change to prioritize high values ( ex: high biomass samples)
+        # Prioritize bins with underrepresented values (e.g., high biomass samples)
         sorted_indices = np.argsort(binned_distributions)
         selected_idx = sorted_indices[:int(len(dataset) * label_fraction)]