From 73ca7ab567b300d883af94fc35f444bc70bc60b7 Mon Sep 17 00:00:00 2001
From: Ali Shibli
Date: Tue, 24 Sep 2024 15:02:07 +0200
Subject: [PATCH 01/17] add stratified sampling to training set

---
 environment.yaml |  3 ++-
 run.py           | 26 ++++++++++++++++++++++----
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/environment.yaml b/environment.yaml
index 1c486cf8..a3d6e4d3 100644
--- a/environment.yaml
+++ b/environment.yaml
@@ -28,4 +28,5 @@ dependencies:
   - google-cloud-storage
   - omegaconf
   - pydataverse
-  - pytest
\ No newline at end of file
+  - pytest
+  - pip install pyDataverse
\ No newline at end of file
diff --git a/run.py b/run.py
index 4f20c1da..682caec3 100644
--- a/run.py
+++ b/run.py
@@ -10,6 +10,7 @@
 import torch
 from torch.utils.data import DataLoader, Subset
 from torch.utils.data.distributed import DistributedSampler
+from sklearn.model_selection import StratifiedShuffleSplit
 
 import foundation_models
 import datasets
@@ -86,6 +87,8 @@
                     help="Percentage of the dataset to use as a decimal, \
                     (e.g., 0.1 for 10%). Default -1 to use the entire dataset.")
 
+parser.add_argument("--stratified_sampling", action="store_true", help="use stratified sampling for dataset splitting")
+
 parser.add_argument("--seed", type=int,
                     help="random seed")
 parser.add_argument("--num_workers", type=int,
@@ -246,13 +249,28 @@ def main():
             )
         )
     collate_fn = get_collate_fn(cfg)
+
     # training
     if not cfg.eval_dir:
 
         if 0 < cfg.limited_label < 1:
-            indices = random.sample(range(len(train_dataset)), int(len(train_dataset)*cfg.limited_label))
-            train_dataset = Subset(train_dataset, indices)
-            perc = cfg.limited_label*100
-            logger.info(f"Created a subset of the train dataset, with {perc}% of the labels available")
+            if cfg.stratified_sampling:
+                # Use stratified sampling with the limited label
+                stratified_split = StratifiedShuffleSplit(n_splits=1, train_size=cfg.limited_label, random_state=cfg.seed)
+
+                labels = train_dataset.targets  # Adjust depending on how labels are stored in your dataset
+                for train_idx, _ in stratified_split.split(torch.zeros(len(labels)), labels):
+                    train_dataset = Subset(train_dataset, train_idx)
+
+                perc = cfg.limited_label * 100
+                logger.info(f"Created a stratified subset of the train dataset, with {perc}% of the labels available.")
+
+            else:
+                # Randomly sample a percentage of the dataset
+                indices = random.sample(range(len(train_dataset)), int(len(train_dataset) * cfg.limited_label))
+                train_dataset = Subset(train_dataset, indices)
+
+                perc = cfg.limited_label * 100
+                logger.info(f"Created a random subset of the train dataset, with {perc}% of the labels available.")
         else:
            logger.info(f"The entire train dataset will be used.")
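
For reference, the sampling logic this patch wires into `run.py` can be exercised standalone. A minimal sketch with a synthetic dataset (sizes and names here are illustrative, not taken from the repository):

```python
import torch
from torch.utils.data import Subset, TensorDataset
from sklearn.model_selection import StratifiedShuffleSplit

# Toy stand-in for train_dataset: 100 samples with binary image-level labels
labels = torch.randint(0, 2, (100,))
dataset = TensorDataset(torch.randn(100, 3, 8, 8), labels)

# Keep 10% of the samples while preserving label proportions,
# mirroring the cfg.stratified_sampling branch in the patch above
splitter = StratifiedShuffleSplit(n_splits=1, train_size=0.1, random_state=42)
train_idx, _ = next(splitter.split(torch.zeros(len(labels)), labels))
subset = Subset(dataset, train_idx)
print(len(subset))  # 10
```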
""" + burned_ratios = [] + + # Step 1: Calculate the burned ratio for each image + for mask_path in self.target_list: + with rasterio.open(mask_path) as src: + mask = src.read(1) + total_pixels = mask.size + burned_pixels = np.sum(mask == 1) + burned_ratio = burned_pixels / total_pixels + burned_ratios.append(burned_ratio) + + # Step 2: Calculate the median burned ratio + median_burned_ratio = np.median(burned_ratios) + + # Step 3: Generate 0/1 labels based on whether burned ratio is above or below the median + labels = [(1 if ratio > median_burned_ratio else 0) for ratio in burned_ratios] + + return labels + def __len__(self): return len(self.image_list) diff --git a/run.py b/run.py index 682caec3..d3dd5eb0 100644 --- a/run.py +++ b/run.py @@ -257,7 +257,8 @@ def main(): # Use stratified sampling with the limited label stratified_split = StratifiedShuffleSplit(n_splits=1, train_size=cfg.limited_label, random_state=cfg.seed) - labels = train_dataset.targets # Adjust depending on how labels are stored in your dataset + labels = train_dataset.labels # Adjust depending on how labels are stored in your dataset + for train_idx, _ in stratified_split.split(torch.zeros(len(labels)), labels): train_dataset = Subset(train_dataset, train_idx) From 90e8c3ca8c46339332d54131aa27b229e7668092 Mon Sep 17 00:00:00 2001 From: Ali Shibli Date: Wed, 2 Oct 2024 19:41:10 +0200 Subject: [PATCH 03/17] add geofmsubset class --- pangaea/datasets/base.py | 31 ++++++++++++++++++++++++++++++- pangaea/datasets/hlsburnscars.py | 18 +----------------- pangaea/run.py | 16 ++++++++-------- 3 files changed, 39 insertions(+), 26 deletions(-) diff --git a/pangaea/datasets/base.py b/pangaea/datasets/base.py index b1fda6d1..0470d143 100644 --- a/pangaea/datasets/base.py +++ b/pangaea/datasets/base.py @@ -1,5 +1,5 @@ import torch -from torch.utils.data import Dataset +from torch.utils.data import Dataset, Subset import os class GeoFMDataset(Dataset): @@ -115,3 +115,32 @@ def download(self) -> None: NotImplementedError: raise if the method is not implemented """ raise NotImplementedError + + +class GeoFMSubset(Subset): + """Custom subset class that retains dataset attributes.""" + + def __init__(self, dataset, indices): + super().__init__(dataset, indices) + + # Copy relevant attributes from the original dataset + self.dataset_name = getattr(dataset, 'dataset_name', None) + self.root_path = getattr(dataset, 'root_path', None) + self.auto_download = getattr(dataset, 'auto_download', None) + self.img_size = getattr(dataset, 'img_size', None) + self.multi_temporal = getattr(dataset, 'multi_temporal', None) + self.multi_modal = getattr(dataset, 'multi_modal', None) + self.ignore_index = getattr(dataset, 'ignore_index', None) + self.num_classes = getattr(dataset, 'num_classes', None) + self.classes = getattr(dataset, 'classes', None) + self.distribution = getattr(dataset, 'distribution', None) + self.bands = getattr(dataset, 'bands', None) + self.data_mean = getattr(dataset, 'data_mean', None) + self.data_std = getattr(dataset, 'data_std', None) + self.data_min = getattr(dataset, 'data_min', None) + self.data_max = getattr(dataset, 'data_max', None) + + def filter_by_indices(self, indices): + """Apply filtering by indices directly in this subset.""" + return GeoFMSubset(self.dataset, indices) + diff --git a/pangaea/datasets/hlsburnscars.py b/pangaea/datasets/hlsburnscars.py index c2a2e0b9..6a5fee97 100644 --- a/pangaea/datasets/hlsburnscars.py +++ b/pangaea/datasets/hlsburnscars.py @@ -143,7 +143,6 @@ def __len__(self): return 

From 90e8c3ca8c46339332d54131aa27b229e7668092 Mon Sep 17 00:00:00 2001
From: Ali Shibli
Date: Wed, 2 Oct 2024 19:41:10 +0200
Subject: [PATCH 03/17] add geofmsubset class

---
 pangaea/datasets/base.py         | 31 ++++++++++++++++++++++++++++++-
 pangaea/datasets/hlsburnscars.py | 18 +-----------------
 pangaea/run.py                   | 16 ++++++++--------
 3 files changed, 39 insertions(+), 26 deletions(-)

diff --git a/pangaea/datasets/base.py b/pangaea/datasets/base.py
index b1fda6d1..0470d143 100644
--- a/pangaea/datasets/base.py
+++ b/pangaea/datasets/base.py
@@ -1,5 +1,5 @@
 import torch
-from torch.utils.data import Dataset
+from torch.utils.data import Dataset, Subset
 import os
 
 class GeoFMDataset(Dataset):
@@ -115,3 +115,32 @@ def download(self) -> None:
             NotImplementedError: raise if the method is not implemented
         """
         raise NotImplementedError
+
+
+class GeoFMSubset(Subset):
+    """Custom subset class that retains dataset attributes."""
+
+    def __init__(self, dataset, indices):
+        super().__init__(dataset, indices)
+
+        # Copy relevant attributes from the original dataset
+        self.dataset_name = getattr(dataset, 'dataset_name', None)
+        self.root_path = getattr(dataset, 'root_path', None)
+        self.auto_download = getattr(dataset, 'auto_download', None)
+        self.img_size = getattr(dataset, 'img_size', None)
+        self.multi_temporal = getattr(dataset, 'multi_temporal', None)
+        self.multi_modal = getattr(dataset, 'multi_modal', None)
+        self.ignore_index = getattr(dataset, 'ignore_index', None)
+        self.num_classes = getattr(dataset, 'num_classes', None)
+        self.classes = getattr(dataset, 'classes', None)
+        self.distribution = getattr(dataset, 'distribution', None)
+        self.bands = getattr(dataset, 'bands', None)
+        self.data_mean = getattr(dataset, 'data_mean', None)
+        self.data_std = getattr(dataset, 'data_std', None)
+        self.data_min = getattr(dataset, 'data_min', None)
+        self.data_max = getattr(dataset, 'data_max', None)
+
+    def filter_by_indices(self, indices):
+        """Apply filtering by indices directly in this subset."""
+        return GeoFMSubset(self.dataset, indices)
+
diff --git a/pangaea/datasets/hlsburnscars.py b/pangaea/datasets/hlsburnscars.py
index c2a2e0b9..6a5fee97 100644
--- a/pangaea/datasets/hlsburnscars.py
+++ b/pangaea/datasets/hlsburnscars.py
@@ -143,7 +143,6 @@ def __len__(self):
         return len(self.image_list)
 
     def __getitem__(self, index):
-
         image = tiff.imread(self.image_list[index])
         image = image.astype(np.float32)  # Convert to float32
         image = torch.from_numpy(image).permute(2, 0, 1)
@@ -155,7 +154,6 @@ def __getitem__(self, index):
         invalid_mask = image == 9999
         image[invalid_mask] = 0
-
         output = {
             'image': {
                 'optical': image,
             },
             'target': target,
             'metadata': {}
         }
-
-        return output
-
-    @staticmethod
-    def get_stratified_train_val_split(all_files) -> Tuple[Sequence[int], Sequence[int]]:
+        return output
 
-        # Fixed stratified sample to split data into train/val.
-        # This keeps 90% of datapoints belonging to an individual event in the training set and puts the remaining 10% in the validation set.
-        disaster_names = list(
-            map(lambda path: pathlib.Path(path).name.split("_")[0], all_files))
-        train_idxs, val_idxs = train_test_split(np.arange(len(all_files)),
-                                                test_size=0.1,
-                                                random_state=23,
-                                                stratify=disaster_names)
-        return {"train": train_idxs, "val": val_idxs}
-
     @staticmethod
     def download(self, silent=False):
         output_path = pathlib.Path(self.root_path)
diff --git a/pangaea/run.py b/pangaea/run.py
index 9312ff53..326b6f0e 100644
--- a/pangaea/run.py
+++ b/pangaea/run.py
@@ -26,7 +26,7 @@
     seed_worker,
 )
 from pangaea.utils.stratification import stratify_single_dataset_indices
-
+from pangaea.datasets.base import GeoFMSubset
 
 def get_exp_name(hydra_config: HydraConf) -> str:
     """Create a unique experiment name based on the choices made in the config.
@@ -154,15 +154,15 @@ def main(cfg: DictConfig) -> None:
             preprocess, dataset=val_dataset, encoder=encoder
         )
     if 0 < cfg.limited_label < 1:
-        n_train_samples = len(train_dataset)
-        indices = random.sample(
-            range(n_train_samples), int(n_train_samples * cfg.limited_label)
-        )
-        # labeled_train_idx, _ = stratify_single_dataset_indices(train_dataset, num_classes=2, label_fraction=0.5, num_bins=3)
-        # indices = labeled_train_idx
-        train_dataset = Subset(train_dataset, indices)
+        # n_train_samples = len(train_dataset)
+        # indices = random.sample(
+        #     range(n_train_samples), int(n_train_samples * cfg.limited_label)
+        # )
+        indices, _ = stratify_single_dataset_indices(train_dataset, num_classes=cfg.dataset.num_classes, label_fraction=cfg.limited_label, num_bins=3)
+        train_dataset = GeoFMSubset(train_dataset, indices)
         logger.info(
             f"Created a subset of the train dataset, with {cfg.limited_label * 100}% of the labels available"
+            f"Total number of patches used: {len(train_dataset)}"
         )
     else:
         logger.info("The entire train dataset will be used.")
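
The reason `GeoFMSubset` copies metadata field by field is that `torch.utils.data.Subset` exposes none of the wrapped dataset's attributes. A hedged alternative sketch (not what the patch does) is to delegate lookups instead of copying:

```python
from torch.utils.data import Subset

class DelegatingSubset(Subset):
    """Subset that falls back to the wrapped dataset for unknown attributes."""

    def __getattr__(self, name):
        # Only invoked when normal attribute lookup fails,
        # so Subset's own fields (dataset, indices) take precedence.
        return getattr(self.dataset, name)
```

The explicit `getattr(..., None)` copies in the patch trade this generality for a visible, greppable list of the fields downstream code may rely on.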

From b3a5a1a93d1dd5862da4ecfe33ef1999c6c64176 Mon Sep 17 00:00:00 2001
From: Ali Shibli
Date: Wed, 2 Oct 2024 20:16:22 +0200
Subject: [PATCH 04/17] add val stratification and logging info

---
 pangaea/datasets/base.py        |  1 +
 pangaea/run.py                  | 14 +++++++++++---
 pangaea/utils/stratification.py | 16 +++++++++-------
 3 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/pangaea/datasets/base.py b/pangaea/datasets/base.py
index 0470d143..3cd3388e 100644
--- a/pangaea/datasets/base.py
+++ b/pangaea/datasets/base.py
@@ -139,6 +139,7 @@ def __init__(self, dataset, indices):
         self.data_std = getattr(dataset, 'data_std', None)
         self.data_min = getattr(dataset, 'data_min', None)
         self.data_max = getattr(dataset, 'data_max', None)
+        self.split = getattr(dataset, 'split', None)
 
     def filter_by_indices(self, indices):
         """Apply filtering by indices directly in this subset."""
diff --git a/pangaea/run.py b/pangaea/run.py
index 326b6f0e..855f87c6 100644
--- a/pangaea/run.py
+++ b/pangaea/run.py
@@ -158,11 +158,19 @@ def main(cfg: DictConfig) -> None:
         # indices = random.sample(
         #     range(n_train_samples), int(n_train_samples * cfg.limited_label)
         # )
-        indices, _ = stratify_single_dataset_indices(train_dataset, num_classes=cfg.dataset.num_classes, label_fraction=cfg.limited_label, num_bins=3)
+
+        # Stratify train dataset
+        indices, _ = stratify_single_dataset_indices(train_dataset, num_classes=cfg.dataset.num_classes, label_fraction=cfg.limited_label, num_bins=3, logger=logger)
         train_dataset = GeoFMSubset(train_dataset, indices)
+
+        # Stratify validation dataset
+        indices, _ = stratify_single_dataset_indices(val_dataset, num_classes=cfg.dataset.num_classes, label_fraction=cfg.limited_label, num_bins=3, logger=logger)
+        val_dataset = GeoFMSubset(val_dataset, indices)
+
         logger.info(
-            f"Created a subset of the train dataset, with {cfg.limited_label * 100}% of the labels available"
-            f"Total number of patches used: {len(train_dataset)}"
+            f"Created a subset of the train and val dataset, with {cfg.limited_label * 100}% of the labels available\n"
+            f"Total number of train patches: {len(train_dataset)}\n"
+            f"Total number of validation patches: {len(val_dataset)}\n"
         )
     else:
         logger.info("The entire train dataset will be used.")
diff --git a/pangaea/utils/stratification.py b/pangaea/utils/stratification.py
index 3a41722b..b88146d9 100644
--- a/pangaea/utils/stratification.py
+++ b/pangaea/utils/stratification.py
@@ -17,28 +17,30 @@ def calculate_class_distributions(dataset, num_classes):
     return np.array(class_distributions)
 
 # Function to bin class distributions with a progress bar
-def bin_class_distributions(class_distributions, num_bins=3):
-    print("Binning class distributions...")
+def bin_class_distributions(class_distributions, num_bins=3, logger=None):
+
+    logger.info("Binning class distributions...")
     # Adding a progress bar for binning class distributions
     binned_distributions = np.digitize(class_distributions, np.linspace(0, 1, num_bins+1)) - 1
     return binned_distributions
 
 # Function to perform stratification and return only the indices
-def stratify_single_dataset_indices(dataset, num_classes, label_fraction=1.0, num_bins=3):
-    print("Starting stratification...")
+def stratify_single_dataset_indices(dataset, num_classes, label_fraction=1.0, num_bins=3, logger=None):
+
+    logger.info("Starting stratification...")
     # Step 1: Calculate class distributions with progress tracking
     class_distributions = calculate_class_distributions(dataset, num_classes)
 
     # Step 2: Bin the class distributions
-    binned_distributions = bin_class_distributions(class_distributions, num_bins=num_bins)
+    binned_distributions = bin_class_distributions(class_distributions, num_bins=num_bins, logger=logger)
 
     # Step 3: Combine the bins to use for stratification
     combined_bins = np.apply_along_axis(lambda row: ''.join(map(str, row)), axis=1, arr=binned_distributions)
 
     # Step 4: Select a subset of labeled data with progress tracking
     num_labeled = int(len(dataset) * label_fraction)
-    print(f"Selecting {label_fraction * 100:.0f}% labeled data from {len(dataset)} samples...")
+    logger.info(f"Selecting {label_fraction * 100:.0f}% labeled data from {len(dataset)} samples...")
 
     # Shuffle and take the labeled part of the dataset based on the binned distributions
     indices = np.arange(len(dataset))
@@ -49,5 +51,5 @@ def stratify_single_dataset_indices(dataset, num_classes, label_fraction=1.0, nu
     labeled_idx = sorted_indices[:num_labeled]
     unlabeled_idx = sorted_indices[num_labeled:]
 
-    print("Stratification complete.")
+    logger.info("Stratification complete.")
     return labeled_idx, unlabeled_idx

From f709e357c871b765009595e782b8e4dc85a4ded8 Mon Sep 17 00:00:00 2001
From: Ali Shibli
Date: Wed, 2 Oct 2024 20:46:48 +0200
Subject: [PATCH 05/17] update readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9c2da1b6..ca1d74bd 100644
--- a/README.md
+++ b/README.md
@@ -101,7 +101,7 @@ We provide several ways to install the dependencies.
 
 ## 🏋️ Training
 
-To run experiments, please refer to `configs/train.yaml`. In it, in addition to some basic info about training (e.g. `finetune` for fine-tuning also the encoder, `limited_label` to train the model on a subset of labels, `num_workers`, `batch_size` and so on), there are 5 different basic configs:
+To run experiments, please refer to `configs/train.yaml`. In it, in addition to some basic info about training (e.g. `finetune` for fine-tuning also the encoder, `limited_label` to train the model on a stratified subset of labels, `num_workers`, `batch_size` and so on), there are 5 different basic configs:
 - `dataset`: Information of downstream datasets such as image size, band_statistics, classes etc.
 - `decoder`: Downstream task decoder fine-tuning related parameters, like the type of architecture (e.g. UPerNet), which multi-temporal strategy to use, and other related hparams (e.g. nr of channels)
 - `encoder`: GFM encoder related parameters. `output_layers` is used for which layers are used for Upernet decoder.

From f396083f2593804722bd15deaf2b07908948aa93 Mon Sep 17 00:00:00 2001
From: yurujaja
Date: Mon, 7 Oct 2024 14:08:37 +0200
Subject: [PATCH 06/17] re-add hlsburn train-val split

---
 pangaea/datasets/base.py         |  1 +
 pangaea/datasets/hlsburnscars.py | 24 ++++++++++++++++--------
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/pangaea/datasets/base.py b/pangaea/datasets/base.py
index 3cd3388e..f611b5c1 100644
--- a/pangaea/datasets/base.py
+++ b/pangaea/datasets/base.py
@@ -127,6 +127,7 @@ def __init__(self, dataset, indices):
         self.dataset_name = getattr(dataset, 'dataset_name', None)
         self.root_path = getattr(dataset, 'root_path', None)
         self.auto_download = getattr(dataset, 'auto_download', None)
+        self.download_url = getattr(dataset, 'download_url', None)
         self.img_size = getattr(dataset, 'img_size', None)
         self.multi_temporal = getattr(dataset, 'multi_temporal', None)
         self.multi_modal = getattr(dataset, 'multi_modal', None)
diff --git a/pangaea/datasets/hlsburnscars.py b/pangaea/datasets/hlsburnscars.py
index 6a5fee97..8774448d 100644
--- a/pangaea/datasets/hlsburnscars.py
+++ b/pangaea/datasets/hlsburnscars.py
@@ -2,21 +2,17 @@
 import time
 import torch
 import numpy as np
-# import rasterio
 import tifffile as tiff
-from typing import Sequence, Dict, Any, Union, Literal, Tuple
+from typing import Sequence, Tuple
 from sklearn.model_selection import train_test_split
 from glob import glob
 
 import torch
-import torchvision.transforms.functional as TF
-import torchvision.transforms as T
 
 import pathlib
 import urllib
 import tarfile
 
-# from utils.registry import DATASET_REGISTRY
 from pangaea.datasets.utils import DownloadProgressBar
 from pangaea.datasets.base import GeoFMDataset
@@ -164,6 +160,20 @@ def __getitem__(self, index):
 
         return output
 
+
+    @staticmethod
+    def get_stratified_train_val_split(all_files) -> Tuple[Sequence[int], Sequence[int]]:
+
+        # Fixed stratified sample to split data into train/val.
+        # This keeps 90% of datapoints belonging to an individual event in the training set and puts the remaining 10% in the validation set.
+        disaster_names = list(
+            map(lambda path: pathlib.Path(path).name.split("_")[0], all_files))
+        train_idxs, val_idxs = train_test_split(np.arange(len(all_files)),
+                                                test_size=0.1,
+                                                random_state=23,
+                                                stratify=disaster_names)
+        return {"train": train_idxs, "val": val_idxs}
+
     @staticmethod
     def download(self, silent=False):
         output_path = pathlib.Path(self.root_path)
@@ -195,6 +205,4 @@ def download(self, silent=False):
                     tar.extractall(output_path)
                 print("done.")
 
-        os.remove(output_path / temp_file_name)
-
-
+        os.remove(output_path / temp_file_name)
\ No newline at end of file
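
The re-added helper stratifies the 90/10 split on the event name encoded at the front of each file name. A self-contained sketch of that mechanism with hypothetical file names:

```python
import numpy as np
from sklearn.model_selection import train_test_split

all_files = [f"fire{i % 4}_patch{i}.tif" for i in range(40)]  # hypothetical names
events = [name.split("_")[0] for name in all_files]

train_idx, val_idx = train_test_split(
    np.arange(len(all_files)), test_size=0.1, random_state=23, stratify=events
)
print(len(train_idx), len(val_idx))  # 36 4, one validation patch per event
```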

From 8080464ace3ad18858f35743ff9079cede5dc490 Mon Sep 17 00:00:00 2001
From: yurujaja
Date: Mon, 7 Oct 2024 17:07:07 +0200
Subject: [PATCH 07/17] limited label for both train and val, random or
 stratified sampling

---
 README.md                                    |  8 ++--
 configs/train.yaml                           |  6 ++-
 pangaea/datasets/hlsburnscars.py             | 15 +------
 pangaea/run.py                               | 40 ++++++++-----------
 .../{stratification.py => subset_sampler.py} | 39 +++++++++++-------
 5 files changed, 53 insertions(+), 55 deletions(-)
 rename pangaea/utils/{stratification.py => subset_sampler.py} (68%)

diff --git a/README.md b/README.md
index ca1d74bd..7bd90efc 100644
--- a/README.md
+++ b/README.md
@@ -101,7 +101,7 @@ We provide several ways to install the dependencies.
 
 ## 🏋️ Training
 
-To run experiments, please refer to `configs/train.yaml`. In it, in addition to some basic info about training (e.g. `finetune` for fine-tuning also the encoder, `limited_label` to train the model on a stratified subset of labels, `num_workers`, `batch_size` and so on), there are 5 different basic configs:
+To run experiments, please refer to `configs/train.yaml`. In it, in addition to some basic info about training (e.g. `finetune` for fine-tuning also the encoder, `limited_label_train` to train the model on a stratified subset of labels, `num_workers`, `batch_size` and so on), there are 5 different basic configs:
 - `dataset`: Information of downstream datasets such as image size, band_statistics, classes etc.
 - `decoder`: Downstream task decoder fine-tuning related parameters, like the type of architecture (e.g. UPerNet), which multi-temporal strategy to use, and other related hparams (e.g. nr of channels)
 - `encoder`: GFM encoder related parameters. `output_layers` is used for which layers are used for Upernet decoder.
@@ -136,7 +136,7 @@ torchrun --nnodes=1 --nproc_per_node=1 pangaea/run.py \
     task=segmentation
 ```
 
-If you want to overwrite some parameters (e.g. turn off wandbe, and changing batch size and the path to the dataset):
+If you want to overwrite some parameters (e.g. turn off wandbe, change batch size and the path to the dataset, and use 50% stratified sampled subset for training):
 ```
 torchrun --nnodes=1 --nproc_per_node=1 pangaea/run.py \
     --config-name=train \
@@ -148,7 +148,9 @@ torchrun --nnodes=1 --nproc_per_node=1 pangaea/run.py \
     task=segmentation \
     dataset.root_path= /path/to/the/dataset/hlsburnscars \
     batch_size=16 \
-    use_wandb=False
+    use_wandb=False \
+    limited_label_train=0.5 \
+    limited_label_strategy=stratified
 ```
 
 #### Multi-Temporal Semantic Segmentation
diff --git a/configs/train.yaml b/configs/train.yaml
index 5e97e1ae..1c9e7ade 100644
--- a/configs/train.yaml
+++ b/configs/train.yaml
@@ -12,7 +12,11 @@ batch_size: 32
 # EXPERIMENT
 finetune: false
 ckpt_dir: null
-limited_label: 1
+limited_label_train: 1
+limited_label_val: 1
+limited_label_strategy: stratified # stratified, random
+stratification_bins: 3 # number of bins for stratified sampling, only for stratified
+
 
 defaults:
   - task: ???
diff --git a/pangaea/datasets/hlsburnscars.py b/pangaea/datasets/hlsburnscars.py
index 8774448d..0678660e 100644
--- a/pangaea/datasets/hlsburnscars.py
+++ b/pangaea/datasets/hlsburnscars.py
@@ -16,7 +16,7 @@
 from pangaea.datasets.utils import DownloadProgressBar
 from pangaea.datasets.base import GeoFMDataset
 
-# @DATASET_REGISTRY.register()
+
 class HLSBurnScars(GeoFMDataset):
     def __init__(
         self,
@@ -161,19 +157,6 @@ def __getitem__(self, index):
 
         return output
 
-    @staticmethod
-    def get_stratified_train_val_split(all_files) -> Tuple[Sequence[int], Sequence[int]]:
-
-        # Fixed stratified sample to split data into train/val.
-        # This keeps 90% of datapoints belonging to an individual event in the training set and puts the remaining 10% in the validation set.
-        disaster_names = list(
-            map(lambda path: pathlib.Path(path).name.split("_")[0], all_files))
-        train_idxs, val_idxs = train_test_split(np.arange(len(all_files)),
-                                                test_size=0.1,
-                                                random_state=23,
-                                                stratify=disaster_names)
-        return {"train": train_idxs, "val": val_idxs}
-
     @staticmethod
     def download(self, silent=False):
         output_path = pathlib.Path(self.root_path)
diff --git a/pangaea/run.py b/pangaea/run.py
index 855f87c6..213b6d49 100644
--- a/pangaea/run.py
+++ b/pangaea/run.py
@@ -10,7 +10,7 @@
 from hydra.core.hydra_config import HydraConfig
 from hydra.utils import instantiate
 from omegaconf import DictConfig, OmegaConf
-from torch.utils.data import DataLoader, Dataset, Subset
+from torch.utils.data import DataLoader, Dataset
 from torch.utils.data.distributed import DistributedSampler
 
 from pangaea.decoders.base import Decoder
@@ -25,9 +25,10 @@
     get_generator,
     seed_worker,
 )
-from pangaea.utils.stratification import stratify_single_dataset_indices
+from pangaea.utils.subset_sampler import get_subset_indices
 from pangaea.datasets.base import GeoFMSubset
 
+
 def get_exp_name(hydra_config: HydraConf) -> str:
     """Create a unique experiment name based on the choices made in the config.
@@ -140,11 +141,6 @@ def main(cfg: DictConfig) -> None:
 
     # training
     if train_run:
-        # Step 1: Stratify the dataset to get indices
-        # labeled_train_idx, _ = stratify_single_dataset_indices(train_dataset, num_classes=2, label_fraction=0.5, num_bins=3)
-        # Step 2: Use Subset to filter the dataset with stratified indices
-        # train_dataset = Subset(train_dataset, labeled_train_idx)
-
         for preprocess in cfg.preprocessing.train:
             train_dataset: Dataset = instantiate(
                 preprocess, dataset=train_dataset, encoder=encoder
@@ -153,27 +149,25 @@ def main(cfg: DictConfig) -> None:
             val_dataset: Dataset = instantiate(
                 preprocess, dataset=val_dataset, encoder=encoder
             )
-        if 0 < cfg.limited_label < 1:
-
-            # Stratify train dataset
-            indices, _ = stratify_single_dataset_indices(train_dataset, num_classes=cfg.dataset.num_classes, label_fraction=cfg.limited_label, num_bins=3, logger=logger)
-            train_dataset = GeoFMSubset(train_dataset, indices)
-
-            # Stratify validation dataset
-            indices, _ = stratify_single_dataset_indices(val_dataset, num_classes=cfg.dataset.num_classes, label_fraction=cfg.limited_label, num_bins=3, logger=logger)
-            val_dataset = GeoFMSubset(val_dataset, indices)
+        if 0 < cfg.limited_label_train < 1:
+            indices = get_subset_indices(
+                train_dataset, strategy=cfg.limited_label_strategy,
+                label_fraction=cfg.limited_label_train, num_bins=cfg.stratification_bins, logger=logger
+            )
+            train_dataset = GeoFMSubset(train_dataset, indices)
 
-            logger.info(
-                f"Created a subset of the train and val dataset, with {cfg.limited_label * 100}% of the labels available\n"
+        if 0 < cfg.limited_label_val < 1:
+            indices = get_subset_indices(
+                val_dataset, strategy=cfg.limited_label_strategy,
+                label_fraction=cfg.limited_label_val, num_bins=cfg.stratification_bins, logger=logger
+            )
+            val_dataset = GeoFMSubset(val_dataset, indices)
+
+        logger.info(
                 f"Total number of train patches: {len(train_dataset)}\n"
                 f"Total number of validation patches: {len(val_dataset)}\n"
-            )
-        else:
-            logger.info("The entire train dataset will be used.")
+        )
 
     # get train val data loaders
     train_loader = DataLoader(
diff --git a/pangaea/utils/stratification.py b/pangaea/utils/subset_sampler.py
similarity index 68%
rename from pangaea/utils/stratification.py
rename to pangaea/utils/subset_sampler.py
index b88146d9..ab6c2e59 100644
--- a/pangaea/utils/stratification.py
+++ b/pangaea/utils/subset_sampler.py
@@ -1,55 +1,66 @@
+import random
 from tqdm import tqdm
 import numpy as np
-from torch.utils.data import Subset, DataLoader
+
 
 # Function to calculate class distributions with a progress bar
 def calculate_class_distributions(dataset, num_classes):
     class_distributions = []
-    
+
     # Adding a progress bar for dataset processing
-    for idx in tqdm(range(len(dataset)), desc="Calculating Class Distributions"):
+    for idx in tqdm(range(len(dataset)), desc="Calculating class distributions per sample"):
         target = dataset[idx]['target']
         total_pixels = target.numel()
         class_counts = [(target == i).sum().item() for i in range(num_classes)]
         class_ratios = [count / total_pixels for count in class_counts]
         class_distributions.append(class_ratios)
-    
+
     return np.array(class_distributions)
 
 # Function to bin class distributions with a progress bar
 def bin_class_distributions(class_distributions, num_bins=3, logger=None):
-
-    logger.info("Binning class distributions...")
+    logger.info(f"Class distributions are being binned into {num_bins} categories")
     # Adding a progress bar for binning class distributions
     binned_distributions = np.digitize(class_distributions, np.linspace(0, 1, num_bins+1)) - 1
     return binned_distributions
 
 # Function to perform stratification and return only the indices
 def stratify_single_dataset_indices(dataset, num_classes, label_fraction=1.0, num_bins=3, logger=None):
-
-    logger.info("Starting stratification...")
     # Step 1: Calculate class distributions with progress tracking
     class_distributions = calculate_class_distributions(dataset, num_classes)
 
     # Step 2: Bin the class distributions
     binned_distributions = bin_class_distributions(class_distributions, num_bins=num_bins, logger=logger)
-    
+
     # Step 3: Combine the bins to use for stratification
     combined_bins = np.apply_along_axis(lambda row: ''.join(map(str, row)), axis=1, arr=binned_distributions)
 
     # Step 4: Select a subset of labeled data with progress tracking
     num_labeled = int(len(dataset) * label_fraction)
-    logger.info(f"Selecting {label_fraction * 100:.0f}% labeled data from {len(dataset)} samples...")
-
-    # Shuffle and take the labeled part of the dataset based on the binned distributions
-    indices = np.arange(len(dataset))
-    np.random.shuffle(indices)
 
     # Sort the indices based on combined bins to preserve class distribution
     sorted_indices = np.argsort(combined_bins)
     labeled_idx = sorted_indices[:num_labeled]
     unlabeled_idx = sorted_indices[num_labeled:]
 
-    logger.info("Stratification complete.")
     return labeled_idx, unlabeled_idx
+
+
+# Function to get subset indices based on the strategy
+def get_subset_indices(dataset, strategy="random", label_fraction=0.5, num_bins=3, logger=None):
+    logger.info(
+        f"Creat a subset of the {dataset.split} dataset using {strategy} strategy, with {label_fraction * 100}% of labels utilized."
+    )
+    if strategy == "stratified":
+        indices, _ = stratify_single_dataset_indices(
+            dataset, num_classes=dataset.num_classes, label_fraction=label_fraction, num_bins=num_bins, logger=logger
+        )
+    else:
+        n_samples = len(dataset)
+        indices = random.sample(
            range(n_samples), int(n_samples * label_fraction)
+        )
+
+    return indices
\ No newline at end of file
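
`bin_class_distributions` hinges on `np.digitize` over `num_bins + 1` equally spaced edges in [0, 1]. A small demo of the mapping it produces (values chosen to land inside the bins, since a ratio of exactly 1.0 falls past the last edge):

```python
import numpy as np

ratios = np.array([[0.05, 0.95],
                   [0.40, 0.60],
                   [0.70, 0.30]])  # per-image class fractions
num_bins = 3
bins = np.digitize(ratios, np.linspace(0, 1, num_bins + 1)) - 1
print(bins)
# [[0 2]
#  [1 1]
#  [2 0]]
```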

From f25a4455796f6ba66e368432b9de2f6483834c34 Mon Sep 17 00:00:00 2001
From: Ali Shibli
Date: Tue, 8 Oct 2024 14:16:06 +0200
Subject: [PATCH 08/17] add regression stratification

---
 configs/train.yaml              |  2 +-
 pangaea/utils/subset_sampler.py | 69 +++++++++++++++++++++++++++------
 2 files changed, 59 insertions(+), 12 deletions(-)

diff --git a/configs/train.yaml b/configs/train.yaml
index 1c9e7ade..5f47d1ed 100644
--- a/configs/train.yaml
+++ b/configs/train.yaml
@@ -14,7 +14,7 @@ finetune: false
 ckpt_dir: null
 limited_label_train: 1
 limited_label_val: 1
-limited_label_strategy: stratified # stratified, random
+limited_label_strategy: stratified_classification # stratified_regression, random
 stratification_bins: 3 # number of bins for stratified sampling, only for stratified
diff --git a/pangaea/utils/subset_sampler.py b/pangaea/utils/subset_sampler.py
index ab6c2e59..9d6290e1 100644
--- a/pangaea/utils/subset_sampler.py
+++ b/pangaea/utils/subset_sampler.py
@@ -3,7 +3,7 @@
 import numpy as np
 
 
-# Function to calculate class distributions with a progress bar
+# Function to calculate class distributions for classification with a progress bar
 def calculate_class_distributions(dataset, num_classes):
     class_distributions = []
 
@@ -17,17 +17,41 @@ def calculate_class_distributions(dataset, num_classes):
 
     return np.array(class_distributions)
 
+
+# Function to calculate distribution metrics for regression
+def calculate_regression_distributions(dataset):
+    distributions = []
+
+    # Adding a progress bar for dataset processing
+    for idx in tqdm(range(len(dataset)), desc="Calculating regression distributions per sample"):
+        target = dataset[idx]['target']
+        mean_value = target.mean().item()  # Example for mean; adjust as needed for other metrics
+        distributions.append(mean_value)
+
+    return np.array(distributions)
+
+
 # Function to bin class distributions with a progress bar
 def bin_class_distributions(class_distributions, num_bins=3, logger=None):
     logger.info(f"Class distributions are being binned into {num_bins} categories")
     # Adding a progress bar for binning class distributions
     binned_distributions = np.digitize(class_distributions, np.linspace(0, 1, num_bins+1)) - 1
     return binned_distributions
 
+
+# Function to bin regression distributions with a progress bar
+def bin_regression_distributions(regression_distributions, num_bins=3, logger=None):
+    logger.info(f"Regression distributions are being binned into {num_bins} categories")
+    # Define the range for binning based on minimum and maximum values in regression distributions
+    binned_distributions = np.digitize(
+        regression_distributions,
+        np.linspace(regression_distributions.min(), regression_distributions.max(), num_bins + 1)
+    ) - 1
+    return binned_distributions
+
+
-# Function to perform stratification and return only the indices
-def stratify_single_dataset_indices(dataset, num_classes, label_fraction=1.0, num_bins=3, logger=None):
+# Function to perform stratification for classification and return only the indices
+def stratify_classification_dataset_indices(dataset, num_classes, label_fraction=1.0, num_bins=3, logger=None):
     # Step 1: Calculate class distributions with progress tracking
     class_distributions = calculate_class_distributions(dataset, num_classes)
 
@@ -48,19 +72,42 @@ def stratify_classification_dataset_indices(dataset, num_classes, label_fraction
     labeled_idx = sorted_indices[:num_labeled]
     unlabeled_idx = sorted_indices[num_labeled:]
 
     return labeled_idx, unlabeled_idx
 
 
+# Function to perform stratification for regression and return only the indices
+def stratify_regression_dataset_indices(dataset, label_fraction=1.0, num_bins=3, logger=None):
+    # Step 1: Calculate regression distributions with progress tracking
+    regression_distributions = calculate_regression_distributions(dataset)
+
+    # Step 2: Bin the regression distributions
+    binned_distributions = bin_regression_distributions(regression_distributions, num_bins=num_bins, logger=logger)
+
+    # Step 3: Sort the indices based on binned distributions for stratification
+    sorted_indices = np.argsort(binned_distributions)
+
+    # Step 4: Select a subset of labeled data with progress tracking
+    num_labeled = int(len(dataset) * label_fraction)
+    labeled_idx = sorted_indices[:num_labeled]
+    unlabeled_idx = sorted_indices[num_labeled:]
+
+    return labeled_idx, unlabeled_idx
+
+
-# Function to get subset indices based on the strategy
+# Function to get subset indices based on the strategy, supporting both classification and regression
 def get_subset_indices(dataset, strategy="random", label_fraction=0.5, num_bins=3, logger=None):
     logger.info(
-        f"Creat a subset of the {dataset.split} dataset using {strategy} strategy, with {label_fraction * 100}% of labels utilized."
-    )
-    if strategy == "stratified":
-        indices, _ = stratify_single_dataset_indices(
+        f"Creating a subset of the {dataset.split} dataset using {strategy} strategy, with {label_fraction * 100}% of labels utilized."
+    )
+    if strategy == "stratified_classification":
+        indices, _ = stratify_classification_dataset_indices(
             dataset, num_classes=dataset.num_classes, label_fraction=label_fraction, num_bins=num_bins, logger=logger
         )
-    else:
+    elif strategy == "stratified_regression":
+        indices, _ = stratify_regression_dataset_indices(
+            dataset, label_fraction=label_fraction, num_bins=num_bins, logger=logger
+        )
+    else:  # Default to random sampling
         n_samples = len(dataset)
         indices = random.sample(
             range(n_samples), int(n_samples * label_fraction)
         )
 
-    return indices
\ No newline at end of file
+    return indices

From 33053c2d4827b7f4df8ee41e2a8f9734ae8114ef Mon Sep 17 00:00:00 2001
From: Ritu Yadav <40523539+RituYadav92@users.noreply.github.com>
Date: Wed, 9 Oct 2024 12:33:37 +0200
Subject: [PATCH 09/17] Updated "stratify_regression_dataset_indices" function
 to return fraction of labels from each bin

Previous code: A fraction of labels were selected from the sorted values.
Specifically, for biomass, it was selecting samples with the lowest biomass.
---
 pangaea/utils/subset_sampler.py | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/pangaea/utils/subset_sampler.py b/pangaea/utils/subset_sampler.py
index 9d6290e1..86b6a008 100644
--- a/pangaea/utils/subset_sampler.py
+++ b/pangaea/utils/subset_sampler.py
@@ -25,7 +25,7 @@ def calculate_regression_distributions(dataset):
     # Adding a progress bar for dataset processing
     for idx in tqdm(range(len(dataset)), desc="Calculating regression distributions per sample"):
         target = dataset[idx]['target']
-        mean_value = target.mean().item()  # Example for mean; adjust as needed for other metrics
+        mean_value = target.mean().item()  # Example for patch-wise mean; adjust as needed for other metrics
         distributions.append(mean_value)
 
     return np.array(distributions)
@@ -79,16 +79,33 @@ def stratify_regression_dataset_indices(dataset, label_fraction=1.0, num_bins=3,
 
     # Step 2: Bin the regression distributions
     binned_distributions = bin_regression_distributions(regression_distributions, num_bins=num_bins, logger=logger)
+
+    # Step 3: Prep a dictionary to hold indices for each bin
+    indices_per_bin = {i: [] for i in range(num_bins)}
+
+    # Step 4: Populate the indices per bin
+    for index, bin_index in enumerate(binned_distributions):
+        if bin_index in indices_per_bin:
+            indices_per_bin[bin_index].append(index)
 
-    # Step 3: Sort the indices based on binned distributions for stratification
-    sorted_indices = np.argsort(binned_distributions)
+    # Step 5: Select fraction of indices from each bin
+    selected_idx = []
+    for bin_index, indices in indices_per_bin.items():
+        num_to_select = int(max(1, len(indices)*label_fraction))  # To ensure at least one index is selected
+        selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
+    other_idx = list(set(range(len(dataset))) - set(selected_idx))
 
-    # Step 4: Select a subset of labeled data with progress tracking
-    num_labeled = int(len(dataset) * label_fraction)
-    labeled_idx = sorted_indices[:num_labeled]
-    unlabeled_idx = sorted_indices[num_labeled:]
+    return selected_idx, other_idx
+
+    # # Step 3: Sort the indices based on binned distributions for stratification
+    # sorted_indices = np.argsort(binned_distributions)
+
+    # # Step 4: Select a subset of labeled data with progress tracking
+    # num_labeled = int(len(dataset) * label_fraction)
+    # labeled_idx = sorted_indices[:num_labeled]
+    # unlabeled_idx = sorted_indices[num_labeled:]
 
-    return labeled_idx, unlabeled_idx
+    # return labeled_idx, unlabeled_idx
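
The key change in this commit: instead of slicing a globally sorted index list (which, for biomass, kept only the lowest-value samples), the fraction is now drawn inside every bin. The selection loop in isolation, on a hand-made bin assignment:

```python
import numpy as np

np.random.seed(1)
binned = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2])  # bin index per sample
label_fraction = 0.5

selected = []
for b in np.unique(binned):
    members = np.flatnonzero(binned == b)
    k = int(max(1, len(members) * label_fraction))  # at least one per bin
    selected.extend(np.random.choice(members, k, replace=False))
print(sorted(selected))  # one sample from each 3-element bin, two from the 4-element bin
```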

From 927eec65d6706f013c8e8d0df0d69fe1fc383c0e Mon Sep 17 00:00:00 2001
From: Ali Shibli
Date: Thu, 10 Oct 2024 11:05:28 +0200
Subject: [PATCH 10/17] adding segmentation stratification

---
 pangaea/utils/subset_sampler.py | 52 +++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 25 deletions(-)

diff --git a/pangaea/utils/subset_sampler.py b/pangaea/utils/subset_sampler.py
index 86b6a008..fca604b0 100644
--- a/pangaea/utils/subset_sampler.py
+++ b/pangaea/utils/subset_sampler.py
@@ -2,7 +2,6 @@
 from tqdm import tqdm
 import numpy as np
 
-
 # Function to calculate class distributions for classification with a progress bar
 def calculate_class_distributions(dataset, num_classes):
     class_distributions = []
@@ -50,26 +49,36 @@ def bin_regression_distributions(regression_distributions, num_bins=3, logger=No
     return binned_distributions
 
 
-# Function to perform stratification for classification and return only the indices
+# Updated function to perform stratification for classification and return only the indices, with even bin selection
 def stratify_classification_dataset_indices(dataset, num_classes, label_fraction=1.0, num_bins=3, logger=None):
     # Step 1: Calculate class distributions with progress tracking
     class_distributions = calculate_class_distributions(dataset, num_classes)
 
     # Step 2: Bin the class distributions
     binned_distributions = bin_class_distributions(class_distributions, num_bins=num_bins, logger=logger)
-
-    # Step 3: Combine the bins to use for stratification
+
+    # Step 3: Prep a dictionary to hold indices for each bin combination
+    indices_per_bin = {}
+
+    # Combine the bins for each class to create unique bin identifiers
     combined_bins = np.apply_along_axis(lambda row: ''.join(map(str, row)), axis=1, arr=binned_distributions)
 
-    # Step 4: Select a subset of labeled data with progress tracking
-    num_labeled = int(len(dataset) * label_fraction)
+    # Populate the dictionary with indices based on combined bin identifiers
+    for idx, bin_id in enumerate(combined_bins):
+        if bin_id not in indices_per_bin:
+            indices_per_bin[bin_id] = []
+        indices_per_bin[bin_id].append(idx)
 
-    # Sort the indices based on combined bins to preserve class distribution
-    sorted_indices = np.argsort(combined_bins)
-    labeled_idx = sorted_indices[:num_labeled]
-    unlabeled_idx = sorted_indices[num_labeled:]
+    # Step 4: Select a proportion of indices from each bin
+    selected_idx = []
+    for bin_id, indices in indices_per_bin.items():
+        num_to_select = int(max(1, len(indices) * label_fraction))  # Ensure at least one index is selected
+        selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
 
-    return labeled_idx, unlabeled_idx
+    # Step 5: Determine the remaining indices not selected
+    other_idx = list(set(range(len(dataset))) - set(selected_idx))
+
+    return selected_idx, other_idx
 
 
 # Function to perform stratification for regression and return only the indices
@@ -87,25 +96,16 @@ def stratify_regression_dataset_indices(dataset, label_fraction=1.0, num_bins=3,
     for index, bin_index in enumerate(binned_distributions):
         if bin_index in indices_per_bin:
             indices_per_bin[bin_index].append(index)
-
+
     # Step 5: Select fraction of indices from each bin
     selected_idx = []
     for bin_index, indices in indices_per_bin.items():
-        num_to_select = int(max(1, len(indices)*label_fraction))  # To ensure at least one index is selected
-        selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
+        num_to_select = int(max(1, len(indices) * label_fraction))  # Ensure at least one index is selected
+        selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
+
     other_idx = list(set(range(len(dataset))) - set(selected_idx))
-
-    return selected_idx, other_idx
-
-    # # Step 3: Sort the indices based on binned distributions for stratification
-    # sorted_indices = np.argsort(binned_distributions)
 
-    # # Step 4: Select a subset of labeled data with progress tracking
-    # num_labeled = int(len(dataset) * label_fraction)
-    # labeled_idx = sorted_indices[:num_labeled]
-    # unlabeled_idx = sorted_indices[num_labeled:]
 
-    # return labeled_idx, unlabeled_idx
+    return selected_idx, other_idx
 
 
 # Function to get subset indices based on the strategy, supporting both classification and regression
@@ -128,3 +128,5 @@ def get_subset_indices(dataset, strategy="random", label_fraction=0.5, num_bins=
     )
 
     return indices
+
+

From 04f783b4c6e8c6921c5f18151ff1b5d5f9375d0a Mon Sep 17 00:00:00 2001
From: yurujaja
Date: Thu, 10 Oct 2024 13:23:04 +0200
Subject: [PATCH 11/17] deep copy ckpt

---
 pangaea/engine/trainer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pangaea/engine/trainer.py b/pangaea/engine/trainer.py
index da55a685..2a3f37c7 100644
--- a/pangaea/engine/trainer.py
+++ b/pangaea/engine/trainer.py
@@ -1,3 +1,4 @@
+import copy
 import logging
 import operator
 import os
@@ -193,7 +194,7 @@ def get_checkpoint(self, epoch: int) -> dict[str, dict | int]:
             "scaler": self.scaler.state_dict(),
             "epoch": epoch,
         }
-        return checkpoint
+        return copy.deepcopy(checkpoint)
 
     def save_model(
         self,
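
The deep copy matters because `state_dict()` returns tensors that alias the live parameters, so an in-memory "best checkpoint" would otherwise keep changing as training continues. A minimal demonstration:

```python
import copy
import torch

layer = torch.nn.Linear(2, 2)
shallow = layer.state_dict()                # aliases the live weights
frozen = copy.deepcopy(layer.state_dict())  # independent snapshot

with torch.no_grad():
    layer.weight.add_(1.0)                  # simulate a later optimizer step

print(torch.equal(shallow["weight"], layer.weight))  # True  -> it drifted
print(torch.equal(frozen["weight"], layer.weight))   # False -> snapshot intact
```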

From b3b13784f64ad6deeeb62918942278de1b3b2627 Mon Sep 17 00:00:00 2001
From: Ritu Yadav <40523539+RituYadav92@users.noreply.github.com>
Date: Thu, 10 Oct 2024 14:45:23 +0200
Subject: [PATCH 12/17] synched the steps between classification and regression

---
 pangaea/utils/subset_sampler.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/pangaea/utils/subset_sampler.py b/pangaea/utils/subset_sampler.py
index fca604b0..7256dc9d 100644
--- a/pangaea/utils/subset_sampler.py
+++ b/pangaea/utils/subset_sampler.py
@@ -63,19 +63,19 @@ def stratify_classification_dataset_indices(dataset, num_classes, label_fraction
     # Combine the bins for each class to create unique bin identifiers
     combined_bins = np.apply_along_axis(lambda row: ''.join(map(str, row)), axis=1, arr=binned_distributions)
 
-    # Populate the dictionary with indices based on combined bin identifiers
+    # Step 4: Populate the dictionary with indices based on combined bin identifiers
     for idx, bin_id in enumerate(combined_bins):
         if bin_id not in indices_per_bin:
             indices_per_bin[bin_id] = []
         indices_per_bin[bin_id].append(idx)
 
-    # Step 4: Select a proportion of indices from each bin
+    # Step 5: Select a fraction of indices from each bin
     selected_idx = []
     for bin_id, indices in indices_per_bin.items():
         num_to_select = int(max(1, len(indices) * label_fraction))  # Ensure at least one index is selected
         selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
 
-    # Step 5: Determine the remaining indices not selected
+    # Step 6: List the remaining unselected indices
     other_idx = list(set(range(len(dataset))) - set(selected_idx))
 
     return selected_idx, other_idx
@@ -97,12 +97,13 @@ def stratify_regression_dataset_indices(dataset, label_fraction=1.0, num_bins=3,
         if bin_index in indices_per_bin:
             indices_per_bin[bin_index].append(index)
 
-    # Step 5: Select fraction of indices from each bin
+    # Step 5: Select a fraction of indices from each bin
     selected_idx = []
     for bin_index, indices in indices_per_bin.items():
         num_to_select = int(max(1, len(indices) * label_fraction))  # Ensure at least one index is selected
         selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
-
+
+    # Step 6: List the remaining unselected indices
     other_idx = list(set(range(len(dataset))) - set(selected_idx))
 
     return selected_idx, other_idx

From bdc7dd6fba663f70bbd10ea3f08735347cd8669b Mon Sep 17 00:00:00 2001
From: yurujaja
Date: Thu, 10 Oct 2024 16:03:13 +0200
Subject: [PATCH 13/17] enable stratified sampling and oversampling

---
 configs/train.yaml              |   3 +-
 pangaea/engine/trainer.py       |   5 +-
 pangaea/run.py                  |  23 ++++--
 pangaea/utils/subset_sampler.py | 137 ++++++++++++++++++++------------
 4 files changed, 104 insertions(+), 64 deletions(-)

diff --git a/configs/train.yaml b/configs/train.yaml
index 5f47d1ed..239db1f8 100644
--- a/configs/train.yaml
+++ b/configs/train.yaml
@@ -14,10 +14,11 @@ finetune: false
 ckpt_dir: null
 limited_label_train: 1
 limited_label_val: 1
-limited_label_strategy: stratified_classification # stratified_regression, random
+limited_label_strategy: stratified # Options: stratified, oversampled, random
 stratification_bins: 3 # number of bins for stratified sampling, only for stratified
 
+
 defaults:
   - task: ???
   - dataset: ???
diff --git a/pangaea/engine/trainer.py b/pangaea/engine/trainer.py
index e4e0f1d8..d10cec40 100644
--- a/pangaea/engine/trainer.py
+++ b/pangaea/engine/trainer.py
@@ -78,10 +78,7 @@ def __init__(
         self.training_metrics = {}
         self.best_ckpt = None
         self.best_metric_comp = operator.gt
-        if isinstance(self.train_loader.dataset, Subset):
-            self.num_classes = self.train_loader.dataset.dataset.num_classes
-        else:
-            self.num_classes = self.train_loader.dataset.num_classes
+        self.num_classes = self.train_loader.dataset.num_classes
 
         assert precision in [
             "fp32",
diff --git a/pangaea/run.py b/pangaea/run.py
index 213b6d49..a5e3687d 100644
--- a/pangaea/run.py
+++ b/pangaea/run.py
@@ -29,21 +29,30 @@
 from pangaea.datasets.base import GeoFMSubset
 
 
-def get_exp_name(hydra_config: HydraConf) -> str:
+def get_exp_info(hydra_config: HydraConf) -> str:
     """Create a unique experiment name based on the choices made in the config.
 
     Args:
         hydra_config (HydraConf): hydra config.
 
     Returns:
-        str: experiment name.
+        str: experiment information.
     """
     choices = OmegaConf.to_container(hydra_config.runtime.choices)
     timestamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
     fm = choices["encoder"]
     decoder = choices["decoder"]
     ds = choices["dataset"]
-    return f"{timestamp}-{fm}-{decoder}-{ds}"
+    task = choices["task"]
+    exp_info = {
+        "timestamp": timestamp,
+        "fm": fm,
+        "decoder": decoder,
+        "ds": ds,
+        "task": task,
+        "exp_name": f"{timestamp}_{fm}_{decoder}_{ds}",
+    }
+    return exp_info
 
 
 @hydra.main(version_base=None, config_path="../configs", config_name="train")
@@ -66,7 +75,9 @@ def main(cfg: DictConfig) -> None:
     # true if training else false
     train_run = cfg.train
     if train_run:
-        exp_name = get_exp_name(HydraConfig.get())
+        exp_info = get_exp_info(HydraConfig.get())
+        exp_name = exp_info["exp_name"]
+        task_name = exp_info["task"]
         exp_dir = pathlib.Path(cfg.work_dir) / exp_name
         exp_dir.mkdir(parents=True, exist_ok=True)
         logger_path = exp_dir / "train.log"
@@ -154,14 +163,14 @@ def main(cfg: DictConfig) -> None:
         if 0 < cfg.limited_label_train < 1:
             indices = get_subset_indices(
-                train_dataset, strategy=cfg.limited_label_strategy,
+                train_dataset, task=task_name, strategy=cfg.limited_label_strategy,
                 label_fraction=cfg.limited_label_train, num_bins=cfg.stratification_bins, logger=logger
             )
             train_dataset = GeoFMSubset(train_dataset, indices)
 
         if 0 < cfg.limited_label_val < 1:
             indices = get_subset_indices(
-                val_dataset, strategy=cfg.limited_label_strategy,
+                val_dataset, task=task_name, strategy=cfg.limited_label_strategy,
                 label_fraction=cfg.limited_label_val, num_bins=cfg.stratification_bins, logger=logger
             )
             val_dataset = GeoFMSubset(val_dataset, indices)
diff --git a/pangaea/utils/subset_sampler.py b/pangaea/utils/subset_sampler.py
index 7256dc9d..b6cba9b7 100644
--- a/pangaea/utils/subset_sampler.py
+++ b/pangaea/utils/subset_sampler.py
@@ -1,27 +1,39 @@
 import random
 from tqdm import tqdm
 import numpy as np
+from pangaea.datasets.base import GeoFMDataset
+from pangaea.datasets.base import GeoFMSubset
 
-# Function to calculate class distributions for classification with a progress bar
-def calculate_class_distributions(dataset, num_classes):
+# Calculate image-wise class distributions for segmentation
+def calculate_class_distributions(dataset: GeoFMDataset|GeoFMSubset):
+    num_classes = dataset.num_classes
+    ignore_index = dataset.ignore_index
     class_distributions = []
 
-    # Adding a progress bar for dataset processing
     for idx in tqdm(range(len(dataset)), desc="Calculating class distributions per sample"):
         target = dataset[idx]['target']
+
+        if ignore_index is not None:
+            target=target[(target != ignore_index)]
+
         total_pixels = target.numel()
-        class_counts = [(target == i).sum().item() for i in range(num_classes)]
-        class_ratios = [count / total_pixels for count in class_counts]
-        class_distributions.append(class_ratios)
+        if total_pixels == 0:
+            class_distributions.append([0] * num_classes)
+            continue
+        else:
+            class_counts = [(target == i).sum().item() for i in range(num_classes)]
+            class_ratios = [count / total_pixels for count in class_counts]
+            class_distributions.append(class_ratios)
+
+    print(np.mean(class_distributions, axis=0))
 
     return np.array(class_distributions)
 
 
-# Function to calculate distribution metrics for regression
-def calculate_regression_distributions(dataset):
+# Calculate image-wise distributions for regression
+def calculate_regression_distributions(dataset: GeoFMDataset|GeoFMSubset):
     distributions = []
 
-    # Adding a progress bar for dataset processing
     for idx in tqdm(range(len(dataset)), desc="Calculating regression distributions per sample"):
         target = dataset[idx]['target']
         mean_value = target.mean().item()  # Example for patch-wise mean; adjust as needed for other metrics
         distributions.append(mean_value)
 
     return np.array(distributions)
 
 
-# Function to bin class distributions with a progress bar
+# Function to bin class distributions
 def bin_class_distributions(class_distributions, num_bins=3, logger=None):
     logger.info(f"Class distributions are being binned into {num_bins} categories")
-    # Adding a progress bar for binning class distributions
+
     binned_distributions = np.digitize(class_distributions, np.linspace(0, 1, num_bins+1)) - 1
     return binned_distributions
 
 
-# Function to bin regression distributions with a progress bar
+# Function to bin regression distributions
 def bin_regression_distributions(regression_distributions, num_bins=3, logger=None):
     logger.info(f"Regression distributions are being binned into {num_bins} categories")
     # Define the range for binning based on minimum and maximum values in regression distributions
     binned_distributions = np.digitize(
         regression_distributions,
         np.linspace(regression_distributions.min(), regression_distributions.max(), num_bins + 1)
     ) - 1
     return binned_distributions
 
 
-# Updated function to perform stratification for classification and return only the indices, with even bin selection
-def stratify_classification_dataset_indices(dataset, num_classes, label_fraction=1.0, num_bins=3, logger=None):
-    # Step 1: Calculate class distributions with progress tracking
-    class_distributions = calculate_class_distributions(dataset, num_classes)
+def balance_seg_indices(
+        dataset:GeoFMDataset|GeoFMSubset,
+        strategy,
+        label_fraction=1.0,
+        num_bins=3,
+        logger=None):
+    # Calculate class distributions with progress tracking
+    class_distributions = calculate_class_distributions(dataset)
 
-    # Step 2: Bin the class distributions
+    # Bin the class distributions
     binned_distributions = bin_class_distributions(class_distributions, num_bins=num_bins, logger=logger)
+    combined_bins = np.apply_along_axis(lambda row: ''.join(map(str, row)), axis=1, arr=binned_distributions)
 
-    # Step 3: Prep a dictionary to hold indices for each bin combination
     indices_per_bin = {}
-
-    # Combine the bins for each class to create unique bin identifiers
-    combined_bins = np.apply_along_axis(lambda row: ''.join(map(str, row)), axis=1, arr=binned_distributions)
-
-    # Step 4: Populate the dictionary with indices based on combined bin identifiers
     for idx, bin_id in enumerate(combined_bins):
         if bin_id not in indices_per_bin:
             indices_per_bin[bin_id] = []
         indices_per_bin[bin_id].append(idx)
 
-    # Step 5: Select a fraction of indices from each bin
-    selected_idx = []
-    for bin_id, indices in indices_per_bin.items():
-        num_to_select = int(max(1, len(indices) * label_fraction))  # Ensure at least one index is selected
-        selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
+    if strategy == "stratified":
+        # Select a proportion of indices from each bin
+        selected_idx = []
+        for bin_id, indices in indices_per_bin.items():
+            num_to_select = int(max(1, len(indices) * label_fraction))  # Ensure at least one index is selected
+            selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
+    elif strategy == "oversampled":
+        # Prioritize the bins with the lowest values
+        sorted_indices = np.argsort(combined_bins)
+        selected_idx = sorted_indices[:int(len(dataset) * label_fraction)]
 
-    # Step 6: List the remaining unselected indices
+    # Determine the remaining indices not selected
     other_idx = list(set(range(len(dataset))) - set(selected_idx))
 
     return selected_idx, other_idx
 
 
-# Function to perform stratification for regression and return only the indices
-def stratify_regression_dataset_indices(dataset, label_fraction=1.0, num_bins=3, logger=None):
-    # Step 1: Calculate regression distributions with progress tracking
+def balance_reg_indices(
+        dataset:GeoFMDataset|GeoFMSubset,
+        strategy,
+        label_fraction=1.0,
+        num_bins=3,
+        logger=None):
+
     regression_distributions = calculate_regression_distributions(dataset)
-
-    # Step 2: Bin the regression distributions
     binned_distributions = bin_regression_distributions(regression_distributions, num_bins=num_bins, logger=logger)
 
     indices_per_bin = {i: [] for i in range(num_bins)}
 
-    # Step 4: Populate the indices per bin
+    # Populate the indices per bin
     for index, bin_index in enumerate(binned_distributions):
         if bin_index in indices_per_bin:
             indices_per_bin[bin_index].append(index)
 
-    # Step 5: Select a fraction of indices from each bin
-    selected_idx = []
-    for bin_index, indices in indices_per_bin.items():
-        num_to_select = int(max(1, len(indices) * label_fraction))  # Ensure at least one index is selected
-        selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
+    if strategy == "stratified":
+        # Select fraction of indices from each bin
+        selected_idx = []
+        for bin_index, indices in indices_per_bin.items():
+            num_to_select = int(max(1, len(indices) * label_fraction))  # Ensure at least one index is selected
+            selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
+    elif strategy == "oversampled":
+        # Prioritize the bins with the lowest values
+        sorted_indices = np.argsort(binned_distributions)
+        selected_idx = sorted_indices[:int(len(dataset) * label_fraction)]
 
-    # Step 6: List the remaining unselected indices
     other_idx = list(set(range(len(dataset))) - set(selected_idx))
 
     return selected_idx, other_idx
 
 
 # Function to get subset indices based on the strategy, supporting both classification and regression
-def get_subset_indices(dataset, strategy="random", label_fraction=0.5, num_bins=3, logger=None):
+def get_subset_indices(dataset: GeoFMDataset,
+                       task="segmentation",
+                       strategy="random",
+                       label_fraction=0.5,
+                       num_bins=3,
+                       logger=None):
     logger.info(
         f"Creating a subset of the {dataset.split} dataset using {strategy} strategy, with {label_fraction * 100}% of labels utilized."
     )
-    if strategy == "stratified_classification":
-        indices, _ = stratify_classification_dataset_indices(
-            dataset, num_classes=dataset.num_classes, label_fraction=label_fraction, num_bins=num_bins, logger=logger
-        )
-    elif strategy == "stratified_regression":
-        indices, _ = stratify_regression_dataset_indices(
-            dataset, label_fraction=label_fraction, num_bins=num_bins, logger=logger
-        )
-    else:  # Default to random sampling
+    assert strategy in ["random", "stratified", "oversampled"], "Unsupported dataset subsampling strategy"
+
+    if strategy == "random":
         n_samples = len(dataset)
         indices = random.sample(
             range(n_samples), int(n_samples * label_fraction)
         )
+        return indices
+
+    elif task == "segmentation":
+        indices, _ = balance_seg_indices(
+            dataset, strategy=strategy, label_fraction=label_fraction, num_bins=num_bins, logger=logger
+        )
+    elif task == "regression":
+        indices, _ = balance_reg_indices(
+            dataset, strategy=strategy, label_fraction=label_fraction, num_bins=num_bins, logger=logger
+        )
 
     return indices
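
Driving the new dispatcher end to end, assuming the package from this series is importable; the stub dataset below is a made-up stand-in exposing only the fields the sampler reads (`split`, `num_classes`, `ignore_index`, `__len__`, `__getitem__`):

```python
import logging
import torch
from pangaea.utils.subset_sampler import get_subset_indices

logging.basicConfig(level=logging.INFO)

class StubDataset:
    """Minimal stand-in for a GeoFMDataset."""
    split, num_classes, ignore_index = "train", 2, None

    def __len__(self):
        return 20

    def __getitem__(self, idx):
        g = torch.Generator().manual_seed(idx)
        return {"target": torch.randint(0, 2, (8, 8), generator=g)}

indices = get_subset_indices(
    StubDataset(), task="segmentation", strategy="stratified",
    label_fraction=0.5, num_bins=3, logger=logging.getLogger("sampler"),
)
print(f"kept {len(indices)} of 20 patches")
```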

From 28238dbd08c45b9fe1d9824ea6a90614f508c8b9 Mon Sep 17 00:00:00 2001
From: yurujaja
Date: Thu, 10 Oct 2024 16:39:10 +0200
Subject: [PATCH 14/17] add docstring

---
 pangaea/utils/subset_sampler.py | 65 ++++++++++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 2 deletions(-)

diff --git a/pangaea/utils/subset_sampler.py b/pangaea/utils/subset_sampler.py
index b6cba9b7..06b5e8d6 100644
--- a/pangaea/utils/subset_sampler.py
+++ b/pangaea/utils/subset_sampler.py
@@ -24,8 +24,6 @@ def calculate_class_distributions(dataset: GeoFMDataset|GeoFMSubset):
             class_counts = [(target == i).sum().item() for i in range(num_classes)]
             class_ratios = [count / total_pixels for count in class_counts]
             class_distributions.append(class_ratios)
-
-    print(np.mean(class_distributions, axis=0))
 
     return np.array(class_distributions)
@@ -65,6 +63,37 @@ def balance_seg_indices(
         strategy,
         label_fraction=1.0,
         num_bins=3,
         logger=None):
+    """
+    Balances and selects indices from a segmentation dataset based on the specified strategy.
+
+    Args:
+    dataset : GeoFMDataset | GeoFMSubset
+        The dataset from which to select indices, typically containing geospatial segmentation data.
+
+    strategy : str
+        The strategy to use for selecting indices. Options include:
+        - "stratified": Proportionally selects indices from each class bin based on the class distribution.
+        - "oversampled": Prioritizes and selects indices from bins with lower class representation.
+
+    label_fraction : float, optional, default=1.0
+        The fraction of labels (indices) to select from each class or bin. Values should be between 0 and 1.
+
+    num_bins : int, optional, default=3
+        The number of bins to divide the class distributions into, used for stratification or oversampling.
+
+    logger : object, optional
+        A logger object for tracking progress or logging messages (e.g., `logging.Logger`)
+
+    ------
+
+    Returns:
+    selected_idx : list of int
+        The indices of the selected samples based on the strategy and label fraction.
+
+    other_idx : list of int
+        The remaining indices that were not selected.
+
+    """
+
     # Calculate class distributions with progress tracking
     class_distributions = calculate_class_distributions(dataset)
@@ -131,6 +160,38 @@ def balance_reg_indices(
         strategy,
         label_fraction=1.0,
         num_bins=3,
         logger=None):
+
+    """
+    Balances and selects indices from a regression dataset based on the specified strategy.
+
+    Args:
+    dataset : GeoFMDataset | GeoFMSubset
+        The dataset from which to select indices, typically containing geospatial regression data.
+
+    strategy : str
+        The strategy to use for selecting indices. Options include:
+        - "stratified": Proportionally selects indices from each bin based on the binned regression distributions.
+        - "oversampled": Prioritizes and selects indices from bins with lower representation.
+
+    label_fraction : float, optional, default=1.0
+        The fraction of indices to select from each bin. Values should be between 0 and 1.
+
+    num_bins : int, optional, default=3
+        The number of bins to divide the regression distributions into, used for stratification or oversampling.
+
+    logger : object, optional
+        A logger object for tracking progress or logging messages (e.g., `logging.Logger`). If None, no logging is performed.
+
+    ------
+
+    Returns:
+    selected_idx : list of int
+        The indices of the selected samples based on the strategy and label fraction.
+
+    other_idx : list of int
+        The remaining indices that were not selected.
+
+    """
+
     regression_distributions = calculate_regression_distributions(dataset)

From afa76a0493ac08f255275f5eb66b6219f6a467bc Mon Sep 17 00:00:00 2001
From: Valerio Marsocci <49810041+VMarsocci@users.noreply.github.com>
Date: Thu, 10 Oct 2024 16:55:30 +0200
Subject: [PATCH 15/17] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7bd90efc..78f5c73c 100644
--- a/README.md
+++ b/README.md
@@ -265,7 +265,7 @@ torchrun --nnodes=1 --nproc_per_node=1 pangaea/run.py \
     --config-name=train \
     dataset=sen1floods11 \
     encoder=unet_encoder \
-    decoder=unet \
+    decoder=seg_unet \
     preprocessing=seg_default \
     criterion=cross_entropy \
     task=segmentation \

From 9159a83462926ad21292e1b74c85e4343883 Mon Sep 17 00:00:00 2001
From: Ritu Yadav <40523539+RituYadav92@users.noreply.github.com>
Date: Thu, 10 Oct 2024 17:02:07 +0200
Subject: [PATCH 16/17] Added comment to guide oversampling for biomass or
 regression in general

---
 pangaea/utils/subset_sampler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pangaea/utils/subset_sampler.py b/pangaea/utils/subset_sampler.py
index 06b5e8d6..b73a218e 100644
--- a/pangaea/utils/subset_sampler.py
+++ b/pangaea/utils/subset_sampler.py
@@ -182,7 +182,7 @@ def balance_reg_indices(
         num_to_select = int(max(1, len(indices) * label_fraction))  # Ensure at least one index is selected
         selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
     elif strategy == "oversampled":
-        # Prioritize the bins with the lowest values
+        # Prioritize the bins with the lowest values. Can change to prioritize high values ( ex: high biomass samples)
         sorted_indices = np.argsort(binned_distributions)
         selected_idx = sorted_indices[:int(len(dataset) * label_fraction)]

From 424535c3743faf789ab7626f8c1b45fc52e16ca8 Mon Sep 17 00:00:00 2001
From: Yuru Jia <91590963+yurujaja@users.noreply.github.com>
Date: Thu, 10 Oct 2024 17:08:43 +0200
Subject: [PATCH 17/17] Update a comment

---
 pangaea/utils/subset_sampler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pangaea/utils/subset_sampler.py b/pangaea/utils/subset_sampler.py
index b73a218e..ffbf763d 100644
--- a/pangaea/utils/subset_sampler.py
+++ b/pangaea/utils/subset_sampler.py
@@ -182,7 +182,7 @@ def balance_reg_indices(
         num_to_select = int(max(1, len(indices) * label_fraction))  # Ensure at least one index is selected
         selected_idx.extend(np.random.choice(indices, num_to_select, replace=False))
     elif strategy == "oversampled":
-        # Prioritize the bins with the lowest values. Can change to prioritize high values ( ex: high biomass samples)
+        # Prioritize bins with underrepresented values (e.g., high biomass samples)
         sorted_indices = np.argsort(binned_distributions)
         selected_idx = sorted_indices[:int(len(dataset) * label_fraction)]